
Commit

fix: join chunks
viacheslav-dobrynin committed Apr 7, 2023
1 parent 596b2ed commit f82828b
Showing 9 changed files with 40 additions and 45 deletions.
4 changes: 2 additions & 2 deletions build.gradle.kts
@@ -67,12 +67,12 @@ tasks.withType<Test> {
}

jmh {
includes.set(listOf("BertHyperParameterBenchmark")) // include pattern (regular expression) for benchmarks to be executed
includes.set(listOf(".*")) // include pattern (regular expression) for benchmarks to be executed
warmupIterations.set(2) // Number of warmup iterations to do
iterations.set(2) // Number of measurement iterations to do
fork.set(2) // How many times to fork a single benchmark. Use 0 to disable forking altogether
zip64.set(true) // is used for big archives (more than 65535 entries)
resultsFile.set(project.file("${project.buildDir}/reports/jmh/results.txt")) // results file
resultsFile.set(project.file("${project.buildDir}/outputs/jmh/results.txt")) // results file
}

ktlint {
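For context, the jmh block above is the configuration DSL of a JMH Gradle plugin. A minimal, self-contained sketch of a build.gradle.kts that such a block assumes — the plugin id and version numbers here are guesses, not taken from this repository:

plugins {
    kotlin("jvm") version "1.8.10"            // assumed Kotlin version
    id("me.champeau.jmh") version "0.7.1"     // assumed JMH Gradle plugin and version
}

jmh {
    includes.set(listOf(".*"))                // regex: run every discovered benchmark
    warmupIterations.set(2)
    iterations.set(2)
    fork.set(2)                               // 0 disables forking
    zip64.set(true)                           // needed once the benchmark jar exceeds 65535 entries
    resultsFile.set(project.file("${project.buildDir}/reports/jmh/results.txt"))  // example output path
}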
@@ -50,7 +50,6 @@ open class BertHyperParameterBenchmark {

private val testContents = generateWindows()

/*
@Benchmark
fun singleThreadBenchmark_100(): Array<FloatArray> {
return singleThreadBenchmark(100)
@@ -75,14 +74,12 @@
fun singleThreadBenchmark_2000(): Array<FloatArray> {
return singleThreadBenchmark(2000)
}
*/

// @Benchmark
// fun singleThreadBenchmark_5000(): Array<FloatArray> {
// return singleThreadBenchmark(5000)
// }
@Benchmark
fun singleThreadBenchmark_5000(): Array<FloatArray> {
return singleThreadBenchmark(5000)
}

/*
@Benchmark
fun singleThreadBenchmark_10_000(): Array<FloatArray> {
return singleThreadBenchmark(10_000)
@@ -106,7 +103,7 @@
@Benchmark
fun singleThreadBenchmark_50_000(): Array<FloatArray> {
return singleThreadBenchmark(50_000)
}*/
}

@Benchmark
fun multithreadedBenchmark_4_5000(): Array<FloatArray> {
@@ -120,28 +117,30 @@
}.toTypedArray()
}

private fun multithreadedBenchmark(batchSize: Int, numThreads: Int): Array<FloatArray> = runBlocking(Dispatchers.Default) {
val counter = AtomicInteger(0)
val chan = Channel<List<String>>(numThreads)
repeat(numThreads) {
launch {
val predictor = tinyModel.newPredictor()
for (data in chan) {
counter.incrementAndGet()
predictor.predict(data.toTypedArray())
private fun multithreadedBenchmark(batchSize: Int, numThreads: Int): Array<FloatArray> =
runBlocking(Dispatchers.Default) {
val counter = AtomicInteger(0)
val chan = Channel<List<String>>(numThreads)
repeat(numThreads) {
launch {
val predictor = tinyModel.newPredictor()
for (data in chan) {
counter.incrementAndGet()
predictor.predict(data.toTypedArray())
}
predictor.close()
}
predictor.close()
}
}

for (data in testContents.chunked(batchSize)) {
chan.send(data)
}
while (!chan.isEmpty) {}
chan.close()
for (data in testContents.chunked(batchSize)) {
chan.send(data)
}
while (!chan.isEmpty) {
}
chan.close()

arrayOf()
}
arrayOf()
}

private fun generateWindows(count: Int = 50_000): List<String> {
val result = mutableListOf<String>()
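The reformatted multithreadedBenchmark above fans batches out over a Channel to numThreads worker coroutines, each reusing its own predictor. A self-contained sketch of the same pattern — FakePredictor stands in for the DJL predictor, and this sketch closes the channel and joins the workers instead of busy-waiting on chan.isEmpty as the committed code does:

import java.util.concurrent.atomic.AtomicInteger
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.joinAll
import kotlinx.coroutines.launch
import kotlinx.coroutines.runBlocking

// Stand-in for the DJL predictor used in the benchmark.
class FakePredictor {
    fun predict(batch: Array<String>): FloatArray = FloatArray(batch.size)
    fun close() {}
}

fun runBatches(contents: List<String>, batchSize: Int, numThreads: Int): Int =
    runBlocking(Dispatchers.Default) {
        val processed = AtomicInteger(0)
        val chan = Channel<List<String>>(capacity = numThreads)
        val workers = List(numThreads) {
            launch {
                val predictor = FakePredictor()   // one predictor per worker, reused across batches
                for (batch in chan) {             // suspends until a batch arrives or the channel closes
                    predictor.predict(batch.toTypedArray())
                    processed.incrementAndGet()
                }
                predictor.close()
            }
        }
        for (batch in contents.chunked(batchSize)) {
            chan.send(batch)                      // suspends while the buffer is full (backpressure)
        }
        chan.close()                              // workers drain the remaining batches, then exit the loop
        workers.joinAll()
        processed.get()
    }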
@@ -45,7 +45,7 @@ class SaveInBatchCommand(

override fun run() {
val contents = Paths.get(contentFile.path)
.bufferedReader(bufferSize = standProperties.app.fileLoadBufferSizeKb * 1024)
.bufferedReader()
.lineSequence()

val seconds = measureTimeSeconds {
1 change: 0 additions & 1 deletion src/main/kotlin/ru/itmo/stand/config/StandProperties.kt
@@ -16,7 +16,6 @@ data class StandProperties @ConstructorBinding constructor(
val basePath: String,
val bertMultiToken: BertMultiToken,
val neighboursAlgorithm: NeighboursAlgorithm,
val fileLoadBufferSizeKb: Int,
)

data class BertMultiToken(
@@ -9,6 +9,7 @@ class BertEmbeddingCalculator(
private val standProperties: StandProperties,
) {

// TODO: configure to return vector for middle token
private val predictor by lazy {
bertModelLoader.loadModel(standProperties.app.neighboursAlgorithm.bertModelType).newPredictor()
}
@@ -23,9 +23,9 @@ class VectorIndexBuilder(
private val log = LoggerFactory.getLogger(javaClass)

fun index(windowedTokensFile: File) {
log.info("Starting vector indexing")
val windowsByTokenPairs = readWindowsByTokenPairs(windowedTokensFile)

log.info("starting vector indexing")
val counter = AtomicInteger(0)
val clusterSizes = AtomicInteger(0)
val windowsCount = AtomicInteger(0)
@@ -37,11 +37,11 @@ class VectorIndexBuilder(
counter.incrementAndGet()
}

log.info("token count: ${counter.get()}")
log.info("cluster sizes: ${clusterSizes.get()}")
log.info("windows count: ${windowsCount.get()}")
log.info("mean windows per token: ${windowsCount.get().toDouble() / counter.get().toDouble()}")
log.info("mean cluster size is ${clusterSizes.get() / counter.get().toFloat()}")
log.info("Token count: ${counter.get()}")
log.info("Cluster sizes: ${clusterSizes.get()}")
log.info("Windows count: ${windowsCount.get()}")
log.info("Mean windows per token: ${windowsCount.get().toDouble() / counter.get().toDouble()}")
log.info("Mean cluster size is ${clusterSizes.get() / counter.get().toFloat()}")
}

private fun readWindowsByTokenPairs(windowedTokensFile: File) = windowedTokensFile
@@ -53,7 +53,7 @@ class VectorIndexBuilder(
val windows = tokenAndWindows[1]
.split(WINDOWS_SEPARATOR)
.filter { it.isNotBlank() }
.take(1000)
.take(1000) // TODO: configure this value
token to windows.map { it.split(WINDOW_DOC_IDS_SEPARATOR).first() }
}

@@ -64,9 +64,9 @@

val doubleEmb = embeddings.toDoubleArray()

val clusterModel = XMeans.fit(doubleEmb, 8)
val clusterModel = XMeans.fit(doubleEmb, 8) // TODO: configure this value

log.info("{} got centroids {}", token.first, clusterModel.k)
log.info("{} got {} centroids", token.first, clusterModel.k)

val centroids = clusterModel.centroids

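The index method above clusters each token's window embeddings with X-Means, which chooses the number of clusters on its own up to a given maximum (the hard-coded 8 that the new TODO wants made configurable). A small sketch with toy data, assuming the XMeans in this hunk is Smile's smile.clustering.XMeans:

import smile.clustering.XMeans

fun main() {
    // Toy "embeddings": 100 points in 4 dimensions with an arbitrary repeating structure.
    val embeddings: Array<DoubleArray> = Array(100) { i ->
        DoubleArray(4) { d -> (i % 5).toDouble() + d * 0.1 }
    }

    // kmax = 8 mirrors the hard-coded value above; X-Means may settle on fewer clusters.
    val clusterModel = XMeans.fit(embeddings, 8)

    println("chose k = ${clusterModel.k}")
    val centroids: Array<DoubleArray> = clusterModel.centroids
    println("first centroid: ${centroids.first().joinToString()}")
}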
5 changes: 2 additions & 3 deletions src/main/kotlin/ru/itmo/stand/util/Concurrency.kt
@@ -10,12 +10,11 @@ fun <T> processParallel(data: Sequence<T>, numWorkers: Int, log: Logger, action:
data
.onEachIndexed { index, _ -> if (index % 10 == 0) log.info("Elements processed: {}", index) }
.chunked(numWorkers)
.mapIndexed { index, chunk ->
.forEach { chunk ->
chunk.map {
launch {
action(it)
}
}
}.joinAll()
}
.forEach { it.joinAll() }
}
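This hunk is what the commit message refers to: instead of mapping chunks to lists of jobs and joining them in a separate forEach pass, each chunk's jobs are now launched and joined before the next chunk is taken, so at most numWorkers actions run at once. A sketch of how the whole function reads after the change, assuming the runBlocking(Dispatchers.Default) wrapper that the hunk does not show:

import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.joinAll
import kotlinx.coroutines.launch
import kotlinx.coroutines.runBlocking
import org.slf4j.Logger
import org.slf4j.LoggerFactory

fun <T> processParallel(data: Sequence<T>, numWorkers: Int, log: Logger, action: (T) -> Unit) =
    runBlocking(Dispatchers.Default) {
        data
            .onEachIndexed { index, _ -> if (index % 10 == 0) log.info("Elements processed: {}", index) }
            .chunked(numWorkers)
            .forEach { chunk ->
                chunk.map { launch { action(it) } } // up to numWorkers jobs for this chunk
                    .joinAll()                      // wait for the whole chunk before taking the next one
            }
    }

fun main() {
    val log: Logger = LoggerFactory.getLogger("demo")
    processParallel(data = (1..25).asSequence(), numWorkers = 4, log = log) { Thread.sleep(50) }
}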
1 change: 0 additions & 1 deletion src/main/resources/application.yml
@@ -16,7 +16,6 @@ stand:
elasticsearch.host-and-port: localhost:9200
app:
base-path: "."
file-load-buffer-size-kb: 512
neighbours-algorithm:
token-batch-size: 5
bert-model-type: TINY
@@ -12,13 +12,11 @@ fun standProperties(
basePath: String = ".",
bertMultiTokenBatchSize: Int = 5,
neighboursAlgorithmBatchSize: Int = 5,
fileLoadBufferSizeMb: Int = 512,
) = StandProperties(
ElasticsearchProperties(elkHostAndPort),
ApplicationProperties(
basePath,
BertMultiToken(bertMultiTokenBatchSize),
NeighboursAlgorithm(neighboursAlgorithmBatchSize, BertModelType.BASE, 500_000),
fileLoadBufferSizeMb,
),
)
