From 07c57112dbf56b956cd4d0033f708f36923ed962 Mon Sep 17 00:00:00 2001 From: abramovich Date: Sun, 28 May 2023 10:28:57 +0300 Subject: [PATCH 1/4] primary docs --- .../impl/neighbours/TokensPipelineExecutor.kt | 23 +++++++++++++++++++ ...Executor.kt => WindowsPipelineExecutor.kt} | 2 +- .../indexing/InvertedIndexBuilder.kt | 1 + .../indexing/WindowedTokenCreator.kt | 6 ++--- .../neighbours/search/NeighboursSearcher.kt | 19 +++++++++++---- .../model/neighbours/NeighboursDocument.kt | 1 + .../repository/neighbours/InvertedIndex.kt | 16 +++++++++++++ .../PreprocessingPipelineExecutorBuilder.kt | 4 ++-- ...Test.kt => WindowsPipelineExecutorTest.kt} | 2 +- 9 files changed, 63 insertions(+), 11 deletions(-) create mode 100644 src/main/kotlin/ru/itmo/stand/service/impl/neighbours/TokensPipelineExecutor.kt rename src/main/kotlin/ru/itmo/stand/service/impl/neighbours/{PreprocessingPipelineExecutor.kt => WindowsPipelineExecutor.kt} (96%) rename src/test/kotlin/ru/itmo/stand/service/impl/neighbours/{PreprocessingPipelineExecutorTest.kt => WindowsPipelineExecutorTest.kt} (96%) diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/TokensPipelineExecutor.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/TokensPipelineExecutor.kt new file mode 100644 index 00000000..f5c39a79 --- /dev/null +++ b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/TokensPipelineExecutor.kt @@ -0,0 +1,23 @@ +package ru.itmo.stand.service.impl.neighbours + +import org.springframework.stereotype.Service +import ru.itmo.stand.config.StandProperties +import ru.itmo.stand.service.preprocessing.ContextSplitter +import ru.itmo.stand.service.preprocessing.StopWordRemover +import ru.itmo.stand.service.preprocessing.TextCleaner +import ru.itmo.stand.service.preprocessing.Tokenizer +import ru.itmo.stand.util.Window + +@Service +class TokensPipelineExecutor( + private val stopWordRemover: StopWordRemover, + private val textCleaner: TextCleaner, + private val tokenizer: Tokenizer, +) { + + fun execute(content: String): List { + val cleanedContent = textCleaner.preprocess(content) + val tokens = tokenizer.preprocess(cleanedContent) + return stopWordRemover.preprocess(tokens) + } +} diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/PreprocessingPipelineExecutor.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/WindowsPipelineExecutor.kt similarity index 96% rename from src/main/kotlin/ru/itmo/stand/service/impl/neighbours/PreprocessingPipelineExecutor.kt rename to src/main/kotlin/ru/itmo/stand/service/impl/neighbours/WindowsPipelineExecutor.kt index d87f9857..a4e15d6b 100644 --- a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/PreprocessingPipelineExecutor.kt +++ b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/WindowsPipelineExecutor.kt @@ -9,7 +9,7 @@ import ru.itmo.stand.service.preprocessing.Tokenizer import ru.itmo.stand.util.Window @Service -class PreprocessingPipelineExecutor( +class WindowsPipelineExecutor( private val standProperties: StandProperties, private val contextSplitter: ContextSplitter, private val stopWordRemover: StopWordRemover, diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt index 094ef6b3..00602e32 100644 --- a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt +++ b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt @@ -68,6 +68,7 @@ class InvertedIndexBuilder( documentEmbeddingRepository.findByDocId(docId).embedding } NeighboursDocument( + token = contextualizedEmbedding.tokenWithEmbeddingId.split(ContextualizedEmbedding.TOKEN_AND_EMBEDDING_ID_SEPARATOR).first(), tokenWithEmbeddingId = contextualizedEmbedding.tokenWithEmbeddingId, docId = docId, score = documentEmbedding.dot(contextualizedEmbedding.embedding), diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/WindowedTokenCreator.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/WindowedTokenCreator.kt index 8e36c81a..3594b427 100644 --- a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/WindowedTokenCreator.kt +++ b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/WindowedTokenCreator.kt @@ -3,7 +3,7 @@ package ru.itmo.stand.service.impl.neighbours.indexing import io.github.oshai.KotlinLogging import org.springframework.stereotype.Service import ru.itmo.stand.config.StandProperties -import ru.itmo.stand.service.impl.neighbours.PreprocessingPipelineExecutor +import ru.itmo.stand.service.impl.neighbours.WindowsPipelineExecutor import ru.itmo.stand.service.model.Document import ru.itmo.stand.util.Window import ru.itmo.stand.util.createPath @@ -11,7 +11,7 @@ import java.io.File @Service class WindowedTokenCreator( - private val preprocessingPipelineExecutor: PreprocessingPipelineExecutor, + private val windowsPipelineExecutor: WindowsPipelineExecutor, private val standProperties: StandProperties, ) { @@ -78,7 +78,7 @@ class WindowedTokenCreator( } fun create(document: Document): List { - return preprocessingPipelineExecutor.execute(document.content) + return windowsPipelineExecutor.execute(document.content) } companion object { diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/search/NeighboursSearcher.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/search/NeighboursSearcher.kt index 4282f1eb..b533930e 100644 --- a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/search/NeighboursSearcher.kt +++ b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/search/NeighboursSearcher.kt @@ -2,26 +2,37 @@ package ru.itmo.stand.service.impl.neighbours.search import org.springframework.stereotype.Service import ru.itmo.stand.service.bert.BertEmbeddingCalculator -import ru.itmo.stand.service.impl.neighbours.PreprocessingPipelineExecutor +import ru.itmo.stand.service.impl.neighbours.TokensPipelineExecutor +import ru.itmo.stand.service.impl.neighbours.WindowsPipelineExecutor import ru.itmo.stand.storage.embedding.ContextualizedEmbeddingRepository import ru.itmo.stand.storage.lucene.repository.neighbours.InvertedIndex @Service class NeighboursSearcher( private val contextualizedEmbeddingRepository: ContextualizedEmbeddingRepository, - private val preprocessingPipelineExecutor: PreprocessingPipelineExecutor, + private val windowsPipelineExecutor: WindowsPipelineExecutor, private val bertEmbeddingCalculator: BertEmbeddingCalculator, private val invertedIndex: InvertedIndex, + private val tokensPipelineExecutor: TokensPipelineExecutor ) { fun search(query: String): List { - val windows = preprocessingPipelineExecutor.execute(query) + val tokens = tokensPipelineExecutor.execute(query) + + val primaryDocuments = invertedIndex.findByTokens(tokens) + + val windows = windowsPipelineExecutor.execute(query) val embeddings = bertEmbeddingCalculator.calculate(windows.map { it.toTranslatorInput() }.toTypedArray()) + return embeddings.flatMap { embedding -> contextualizedEmbeddingRepository.findByVector(embedding.toTypedArray()) } .let { contextualizedEmbeddings -> val tokenWithEmbeddingIds = contextualizedEmbeddings.map { it.tokenWithEmbeddingId } - invertedIndex.findByTokenWithEmbeddingIds(tokenWithEmbeddingIds).groupingBy { it.docId } + val secondaryDocuments = invertedIndex.findByTokenWithEmbeddingIds(tokenWithEmbeddingIds) + + sequenceOf(primaryDocuments, secondaryDocuments).flatten() + .groupingBy { it.docId } .foldTo(HashMap(), 0f) { acc, doc -> acc + doc.score } + }.entries .sortedByDescending { (_, score) -> score } .take(10) // TODO: configure this value diff --git a/src/main/kotlin/ru/itmo/stand/storage/lucene/model/neighbours/NeighboursDocument.kt b/src/main/kotlin/ru/itmo/stand/storage/lucene/model/neighbours/NeighboursDocument.kt index 4ddb8709..bd74d96d 100644 --- a/src/main/kotlin/ru/itmo/stand/storage/lucene/model/neighbours/NeighboursDocument.kt +++ b/src/main/kotlin/ru/itmo/stand/storage/lucene/model/neighbours/NeighboursDocument.kt @@ -1,6 +1,7 @@ package ru.itmo.stand.storage.lucene.model.neighbours data class NeighboursDocument( + val token: String, val tokenWithEmbeddingId: String, val docId: String, val score: Float, diff --git a/src/main/kotlin/ru/itmo/stand/storage/lucene/repository/neighbours/InvertedIndex.kt b/src/main/kotlin/ru/itmo/stand/storage/lucene/repository/neighbours/InvertedIndex.kt index c0c8c2a6..1237b1f7 100644 --- a/src/main/kotlin/ru/itmo/stand/storage/lucene/repository/neighbours/InvertedIndex.kt +++ b/src/main/kotlin/ru/itmo/stand/storage/lucene/repository/neighbours/InvertedIndex.kt @@ -7,6 +7,8 @@ import org.apache.lucene.document.StringField import org.apache.lucene.index.ConcurrentMergeScheduler import org.apache.lucene.index.IndexWriterConfig import org.apache.lucene.index.Term +import org.apache.lucene.search.BooleanQuery +import org.apache.lucene.search.BoostQuery import org.apache.lucene.search.TermQuery import org.springframework.stereotype.Repository import ru.itmo.stand.config.StandProperties @@ -29,6 +31,7 @@ class InvertedIndex(private val standProperties: StandProperties) : LuceneReposi fun save(entity: NeighboursDocument) { val document = Document() + document.add(StringField(NeighboursDocument::token.name, entity.token, YES)) document.add(StringField(NeighboursDocument::tokenWithEmbeddingId.name, entity.tokenWithEmbeddingId, YES)) document.add(StringField(NeighboursDocument::docId.name, entity.docId, YES)) document.add(StringField(NeighboursDocument::score.name, entity.score.toString(), YES)) @@ -39,14 +42,27 @@ class InvertedIndex(private val standProperties: StandProperties) : LuceneReposi entities.forEach { save(it) } } + fun findByTokens(tokens: Collection): Sequence { + val query = booleanQuery(tokens) { token-> + TermQuery(Term(NeighboursDocument::token.name, token)) + } + + return search(query) + } + fun findByTokenWithEmbeddingIds(tokenWithEmbeddingIds: Collection): Sequence { val query = booleanQuery(tokenWithEmbeddingIds) { tokenWithEmbeddingId -> TermQuery(Term(NeighboursDocument::tokenWithEmbeddingId.name, tokenWithEmbeddingId)) } + return search(query) + } + + private fun search(query: BooleanQuery): Sequence { return searcher.searchAll(query) .map { NeighboursDocument( + it.get(NeighboursDocument::token.name), it.get(NeighboursDocument::tokenWithEmbeddingId.name), it.get(NeighboursDocument::docId.name), it.get(NeighboursDocument::score.name).toFloat(), diff --git a/src/test/kotlin/ru/itmo/stand/fixtures/PreprocessingPipelineExecutorBuilder.kt b/src/test/kotlin/ru/itmo/stand/fixtures/PreprocessingPipelineExecutorBuilder.kt index 47b928b5..db9f00f2 100644 --- a/src/test/kotlin/ru/itmo/stand/fixtures/PreprocessingPipelineExecutorBuilder.kt +++ b/src/test/kotlin/ru/itmo/stand/fixtures/PreprocessingPipelineExecutorBuilder.kt @@ -1,10 +1,10 @@ package ru.itmo.stand.fixtures -import ru.itmo.stand.service.impl.neighbours.PreprocessingPipelineExecutor +import ru.itmo.stand.service.impl.neighbours.WindowsPipelineExecutor import ru.itmo.stand.service.preprocessing.ContextSplitter import ru.itmo.stand.service.preprocessing.StopWordRemover -fun preprocessingPipelineExecutor(): PreprocessingPipelineExecutor = PreprocessingPipelineExecutor( +fun preprocessingPipelineExecutor(): WindowsPipelineExecutor = WindowsPipelineExecutor( standProperties(), ContextSplitter(), StopWordRemover(), diff --git a/src/test/kotlin/ru/itmo/stand/service/impl/neighbours/PreprocessingPipelineExecutorTest.kt b/src/test/kotlin/ru/itmo/stand/service/impl/neighbours/WindowsPipelineExecutorTest.kt similarity index 96% rename from src/test/kotlin/ru/itmo/stand/service/impl/neighbours/PreprocessingPipelineExecutorTest.kt rename to src/test/kotlin/ru/itmo/stand/service/impl/neighbours/WindowsPipelineExecutorTest.kt index b27038cd..6487d7f8 100644 --- a/src/test/kotlin/ru/itmo/stand/service/impl/neighbours/PreprocessingPipelineExecutorTest.kt +++ b/src/test/kotlin/ru/itmo/stand/service/impl/neighbours/WindowsPipelineExecutorTest.kt @@ -5,7 +5,7 @@ import org.junit.jupiter.api.Test import ru.itmo.stand.fixtures.preprocessingPipelineExecutor import ru.itmo.stand.util.Window -class PreprocessingPipelineExecutorTest { +class WindowsPipelineExecutorTest { private val preprocessingPipelineExecutor = preprocessingPipelineExecutor() From a62ad883a49ee9617b7c81530c290e1534cee41c Mon Sep 17 00:00:00 2001 From: abramovich Date: Sun, 28 May 2023 10:29:18 +0300 Subject: [PATCH 2/4] update --- .../stand/service/impl/neighbours/TokensPipelineExecutor.kt | 3 --- .../stand/service/impl/neighbours/search/NeighboursSearcher.kt | 3 +-- .../storage/lucene/repository/neighbours/InvertedIndex.kt | 3 +-- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/TokensPipelineExecutor.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/TokensPipelineExecutor.kt index f5c39a79..7223d7a4 100644 --- a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/TokensPipelineExecutor.kt +++ b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/TokensPipelineExecutor.kt @@ -1,12 +1,9 @@ package ru.itmo.stand.service.impl.neighbours import org.springframework.stereotype.Service -import ru.itmo.stand.config.StandProperties -import ru.itmo.stand.service.preprocessing.ContextSplitter import ru.itmo.stand.service.preprocessing.StopWordRemover import ru.itmo.stand.service.preprocessing.TextCleaner import ru.itmo.stand.service.preprocessing.Tokenizer -import ru.itmo.stand.util.Window @Service class TokensPipelineExecutor( diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/search/NeighboursSearcher.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/search/NeighboursSearcher.kt index b533930e..37d07f04 100644 --- a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/search/NeighboursSearcher.kt +++ b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/search/NeighboursSearcher.kt @@ -13,7 +13,7 @@ class NeighboursSearcher( private val windowsPipelineExecutor: WindowsPipelineExecutor, private val bertEmbeddingCalculator: BertEmbeddingCalculator, private val invertedIndex: InvertedIndex, - private val tokensPipelineExecutor: TokensPipelineExecutor + private val tokensPipelineExecutor: TokensPipelineExecutor, ) { fun search(query: String): List { @@ -32,7 +32,6 @@ class NeighboursSearcher( sequenceOf(primaryDocuments, secondaryDocuments).flatten() .groupingBy { it.docId } .foldTo(HashMap(), 0f) { acc, doc -> acc + doc.score } - }.entries .sortedByDescending { (_, score) -> score } .take(10) // TODO: configure this value diff --git a/src/main/kotlin/ru/itmo/stand/storage/lucene/repository/neighbours/InvertedIndex.kt b/src/main/kotlin/ru/itmo/stand/storage/lucene/repository/neighbours/InvertedIndex.kt index 1237b1f7..3c296a75 100644 --- a/src/main/kotlin/ru/itmo/stand/storage/lucene/repository/neighbours/InvertedIndex.kt +++ b/src/main/kotlin/ru/itmo/stand/storage/lucene/repository/neighbours/InvertedIndex.kt @@ -8,7 +8,6 @@ import org.apache.lucene.index.ConcurrentMergeScheduler import org.apache.lucene.index.IndexWriterConfig import org.apache.lucene.index.Term import org.apache.lucene.search.BooleanQuery -import org.apache.lucene.search.BoostQuery import org.apache.lucene.search.TermQuery import org.springframework.stereotype.Repository import ru.itmo.stand.config.StandProperties @@ -43,7 +42,7 @@ class InvertedIndex(private val standProperties: StandProperties) : LuceneReposi } fun findByTokens(tokens: Collection): Sequence { - val query = booleanQuery(tokens) { token-> + val query = booleanQuery(tokens) { token -> TermQuery(Term(NeighboursDocument::token.name, token)) } From 63fb3967ff3ac99f463c008807ba9c8882373ccc Mon Sep 17 00:00:00 2001 From: abramovich Date: Mon, 29 May 2023 21:35:43 +0300 Subject: [PATCH 3/4] stupid tf idf impl --- .../neighbours/DocumentNeighboursService.kt | 2 +- .../indexing/InvertedIndexBuilder.kt | 47 +++++++++++++++++-- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/DocumentNeighboursService.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/DocumentNeighboursService.kt index 89bb0bd3..d31acf8b 100644 --- a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/DocumentNeighboursService.kt +++ b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/DocumentNeighboursService.kt @@ -50,7 +50,7 @@ class DocumentNeighboursService( documentEmbeddingCreator.create(contents.documentSequenceWithSpecifiedCount()) val windowedTokensFile = windowedTokenCreator.create(contents.documentSequenceWithSpecifiedCount()) vectorIndexBuilder.index(windowedTokensFile) - invertedIndexBuilder.index(windowedTokensFile) + invertedIndexBuilder.index(windowedTokensFile, contents.documentSequenceWithSpecifiedCount()) return emptyList() } diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt index 00602e32..8475cde6 100644 --- a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt +++ b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt @@ -4,6 +4,8 @@ import io.github.oshai.KotlinLogging import org.springframework.stereotype.Service import ru.itmo.stand.service.bert.BertEmbeddingCalculator import ru.itmo.stand.service.bert.TranslatorInput +import ru.itmo.stand.service.impl.neighbours.TokensPipelineExecutor +import ru.itmo.stand.service.model.Document import ru.itmo.stand.storage.embedding.ContextualizedEmbeddingRepository import ru.itmo.stand.storage.embedding.model.ContextualizedEmbedding import ru.itmo.stand.storage.lucene.model.neighbours.NeighboursDocument @@ -11,6 +13,7 @@ import ru.itmo.stand.storage.lucene.repository.neighbours.DocumentEmbeddingRepos import ru.itmo.stand.storage.lucene.repository.neighbours.InvertedIndex import ru.itmo.stand.util.dot import java.io.File +import kotlin.math.ln @Service class InvertedIndexBuilder( @@ -18,12 +21,18 @@ class InvertedIndexBuilder( private val documentEmbeddingRepository: DocumentEmbeddingRepository, private val embeddingCalculator: BertEmbeddingCalculator, private val invertedIndex: InvertedIndex, + private val tokensPipelineExecutor: TokensPipelineExecutor ) { private val log = KotlinLogging.logger { } private val documentEmbeddingCache = HashMap() - fun index(windowedTokensFile: File) { + + + fun index(windowedTokensFile: File, documents: Sequence) { + + val (tf, idf) = getTfIdf(documents) + val tokensWithWindows = readTokensWindowsAndDocIds(windowedTokensFile) tokensWithWindows.onEachIndexed { index, token -> @@ -34,13 +43,40 @@ class InvertedIndexBuilder( embeddingCalculator.calculate(windows, BERT_BATCH_SIZE).forEachIndexed { index, embedding -> val docIds = docIdsList[index] contextualizedEmbeddingRepository.findByVector(embedding.toTypedArray()) - .forEach { computeScoreAndSave(docIds, it) } + .forEach { computeScoreAndSave(docIds, it, tf, idf) } } } invertedIndex.completeIndexing() } + private fun getTfIdf(documents: Sequence): Pair>,Map > { + + val df = mutableMapOf>() + val tf = mutableMapOf>() + + var docsCount = 0.0 + for(doc in documents){ + docsCount++ + val tokens = tokensPipelineExecutor.execute(doc.content) + tf[doc.id] = mutableMapOf() + + tokens.forEach { + df.putIfAbsent(it, hashSetOf()) + df[it]?.add(doc.id) + + tf[doc.id]?.putIfAbsent(it, 0.0) + tf[doc.id]?.set(it, tf[doc.id]?.get(it)?.plus((1.0 / tokens.size)) ?: 0.0) + } + } + + val idf = df.map { + it.key to ln(docsCount / it.value.size) + }.toMap() + + return Pair(tf, idf) + } + private fun readTokensWindowsAndDocIds(windowedTokensFile: File) = windowedTokensFile .bufferedReader() .lineSequence() @@ -62,16 +98,19 @@ class InvertedIndexBuilder( private fun computeScoreAndSave( docIds: List, contextualizedEmbedding: ContextualizedEmbedding, + tf: Map>, + idf: Map, ) { + val token = contextualizedEmbedding.tokenWithEmbeddingId.split(ContextualizedEmbedding.TOKEN_AND_EMBEDDING_ID_SEPARATOR).first() val neighboursDocuments = docIds.map { docId -> val documentEmbedding = documentEmbeddingCache.computeIfAbsent(docId) { documentEmbeddingRepository.findByDocId(docId).embedding } NeighboursDocument( - token = contextualizedEmbedding.tokenWithEmbeddingId.split(ContextualizedEmbedding.TOKEN_AND_EMBEDDING_ID_SEPARATOR).first(), + token = token, tokenWithEmbeddingId = contextualizedEmbedding.tokenWithEmbeddingId, docId = docId, - score = documentEmbedding.dot(contextualizedEmbedding.embedding), + score = documentEmbedding.dot(contextualizedEmbedding.embedding) * (tf[docId]?.get(token)?.toFloat() ?: 1.0f) * (idf[token]?.toFloat() ?: 1.0f), ) } invertedIndex.saveAll(neighboursDocuments) From 7f5780625001c605b170f87e2c61ba2b8fcc3981 Mon Sep 17 00:00:00 2001 From: abramovich Date: Mon, 29 May 2023 21:36:26 +0300 Subject: [PATCH 4/4] fix --- .../impl/neighbours/indexing/InvertedIndexBuilder.kt | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt index 8475cde6..8ce8c8b3 100644 --- a/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt +++ b/src/main/kotlin/ru/itmo/stand/service/impl/neighbours/indexing/InvertedIndexBuilder.kt @@ -21,16 +21,13 @@ class InvertedIndexBuilder( private val documentEmbeddingRepository: DocumentEmbeddingRepository, private val embeddingCalculator: BertEmbeddingCalculator, private val invertedIndex: InvertedIndex, - private val tokensPipelineExecutor: TokensPipelineExecutor + private val tokensPipelineExecutor: TokensPipelineExecutor, ) { private val log = KotlinLogging.logger { } private val documentEmbeddingCache = HashMap() - - fun index(windowedTokensFile: File, documents: Sequence) { - val (tf, idf) = getTfIdf(documents) val tokensWithWindows = readTokensWindowsAndDocIds(windowedTokensFile) @@ -50,13 +47,12 @@ class InvertedIndexBuilder( invertedIndex.completeIndexing() } - private fun getTfIdf(documents: Sequence): Pair>,Map > { - + private fun getTfIdf(documents: Sequence): Pair>, Map> { val df = mutableMapOf>() val tf = mutableMapOf>() var docsCount = 0.0 - for(doc in documents){ + for (doc in documents) { docsCount++ val tokens = tokensPipelineExecutor.execute(doc.content) tf[doc.id] = mutableMapOf()