LUCENE-9651 Update benchmark module docs (#759)

apache · Mar 23, 2022 · b3906e9 · b3906e9
1 parent 5450d72
commit b3906e9
Show file tree

Hide file tree

Showing 11 changed files with 28 additions and 29 deletions.
diff --git a/gradle/datasets/external-datasets.gradle b/gradle/datasets/external-datasets.gradle
@@ -24,7 +24,7 @@ configure(project(":lucene:benchmark")) {
   apply plugin: "de.undercouch.download"
 
   ext {
-    dataDir = file("data")
+    dataDir = file("work")
   }
 
   task getEnWiki(type: Download) {
@@ -120,10 +120,9 @@ configure(project(":lucene:benchmark")) {
   task getReuters(type: Download) {
     ext {
       name = "reuters21578"
-      // note: there is no HTTPS url and we don't care because this is merely test/perf data
-      src = "http://www.daviddlewis.com/resources/testcollections/reuters21578/${name}.tar.gz"
+      src = "https://kdd.ics.uci.edu/databases/${name}/${name}.tar.gz"
       intermediate = file("${dataDir}/${name}.tar.gz")
-      dst = file("${dataDir}/${name}")
+      dst = file("${dataDir}/reuters-out")
     }
 
     outputs.dir ext.dst
@@ -171,4 +170,4 @@ configure(project(":lucene:benchmark")) {
       logger.lifecycle("Downloading data set ${task.ext.name} from ${task.ext.src} to ${task.ext.dst}...")
     }
   }
-}
+}
diff --git a/gradle/validation/validate-source-patterns.gradle b/gradle/validation/validate-source-patterns.gradle
@@ -108,6 +108,7 @@ allprojects {
 configure(project(':lucene:benchmark')) {
   project.tasks.withType(ValidateSourcePatternsTask) {
     sourceFiles.exclude 'data/**'
+    sourceFiles.exclude 'work/**'
 
     // Known .txt offenders.
     sourceFiles.exclude '**/reuters.first20.lines.txt', '**/trecQRels.txt'

diff --git a/lucene/benchmark/.gitignore b/lucene/benchmark/.gitignore
@@ -1 +1,2 @@
-/data
+/data
+/work
diff --git a/lucene/benchmark/conf/createLineFile.alg b/lucene/benchmark/conf/createLineFile.alg
@@ -20,9 +20,9 @@
 # This alg will process the Reuters documents feed to produce a
 # single file that contains all documents, one per line.
 #
-# To use this, first cd to benchmark and then run:
+# To use this run:
 #
-#   ant run-task -Dtask.alg=conf/createLineFile.alg
+#   gradlew :lucene:benchmark:run -PtaskAlg=conf/createLineFile.alg
 #
 # Then, to index the documents in the line file, see
 # indexLineFile.alg.

diff --git a/lucene/benchmark/conf/extractWikipedia.alg b/lucene/benchmark/conf/extractWikipedia.alg
@@ -20,9 +20,9 @@
 # This alg will process the Wikipedia documents feed to produce a
 # single file that contains all documents, one per line.
 #
-# To use this, first cd to benchmark and then run:
+# To use this run:
 #
-#   ant run-task -Dtask.alg=conf/extractWikipedia.alg
+#   gradlew :lucene:benchmark:run -PtaskAlg=conf/extractWikipedia.alg
 #
 # Then, to index the documents in the line file, see
 # indexLineFile.alg.

diff --git a/lucene/benchmark/conf/indexLineFile.alg b/lucene/benchmark/conf/indexLineFile.alg
@@ -23,10 +23,9 @@
 # document to let you more accurately measure time spent analyzing and
 # indexing your documents vs time spent creating the documents.
 #
-# To use this, you must first run the createLineFile.alg, then cd to
-# benchmark and then run:
+# To use this, you must first run the createLineFile.alg, then run:
 #
-#   ant run-task -Dtask.alg=conf/indexLineFile.alg
+#   gradlew :lucene:benchmark:run -PtaskAlg=conf/indexLineFile.alg
 #
 
 analyzer=org.apache.lucene.analysis.core.SimpleAnalyzer

diff --git a/lucene/benchmark/conf/micro-standard.alg b/lucene/benchmark/conf/micro-standard.alg
@@ -30,8 +30,7 @@ doc.tokenized=true
 doc.term.vector=false
 log.step=500
 
-work.dir=data
-docs.dir=reuters21578
+docs.dir=reuters-out
 
 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource

diff --git a/lucene/benchmark/conf/readContentSource.alg b/lucene/benchmark/conf/readContentSource.alg
@@ -22,9 +22,9 @@
 # gather baselines for operations like indexing (if reading from the content 
 # source takes 'X' time, we cannot index faster).
 #
-# To use this, first cd to benchmark and then run:
+# To use this run:
 #
-#   ant run-task -Dtask.alg=conf/readContentSource.alg
+#   gradlew :lucene:benchmark:run -PtaskAlg=conf/readContentSource.alg
 #
 
 # Where to get documents from:

diff --git a/lucene/benchmark/conf/tokenize.alg b/lucene/benchmark/conf/tokenize.alg
@@ -20,9 +20,9 @@
 # This alg reads all tokens out of a document but does not index them.
 # This is useful for benchmarking tokenizers.
 #
-# To use this, cd to benchmark and then run:
+# To use this run:
 #
-#   ant run-task -Dtask.alg=conf/tokenize.alg
+#   gradlew :lucene:benchmark:run -PtaskAlg=conf/tokenize.alg
 #
 
 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource

diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/package-info.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/package-info.java
@@ -81,9 +81,9 @@
  * <ul>
  *   <li>./gradlew -p lucene/benchmark getReuters run <br>
  *       - would run the <code>micro-standard.alg</code> "algorithm".
- *   <li>ant run-task -Dtask.alg=conf/compound-penalty.alg <br>
+ *   <li>./gradlew -p lucene/benchmark getReuters run -PtaskAlg=conf/compound-penalty.alg <br>
  *       - would run the <code>compound-penalty.alg</code> "algorithm".
- *   <li>ant run-task -Dtask.alg=[full-path-to-your-alg-file] <br>
+ *   <li>./gradlew -p lucene/benchmark getReuters run -PtaskAlg=[full-path-to-your-alg-file] <br>
  *       - would run <code>your perf test</code> "algorithm".
  *   <li>java org.apache.lucene.benchmark.byTask.programmatic.Sample <br>
  *       - would run a performance test programmatically - without using an alg file. This is less
@@ -131,8 +131,8 @@
  * benchmark.ext.classpath property:
  *
  * <ul>
- *   <li>ant run-task -Dtask.alg=[full-path-to-your-alg-file] <span style="color:
- *       #FF0000">-Dbenchmark.ext.classpath=/mydir/classes </span> -Dtask.mem=512M
+ *   <li>./gradlew -p lucene/benchmark run -PtaskAlg=[full-path-to-your-alg-file] <span
+ *       style="color: #FF0000">-Dbenchmark.ext.classpath=/mydir/classes </span> -Dtask.mem=512M
  * </ul>
  *
  * <p><u>External tasks</u>: When writing your own tasks under a package other than
@@ -494,7 +494,7 @@
  * </pre>
  *
  * <p>The command line for running this sample: <br>
- * <code>ant run-task -Dtask.alg=conf/sample.alg</code>
+ * <code>./gradlew -p lucene/benchmark getReuters run -PtaskAlg=conf/sample.alg</code>
  *
  * <p>The output report from running this test contains the following:
  *

diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/package-info.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/package-info.java
@@ -35,12 +35,12 @@
  * alternate views or to take in command line options. When reporting benchmarking runs you should
  * state any alterations you have made.
  *
- * <p>To run the short version of the StandardBenchmarker, call "ant run-micro-standard". This
- * should take a minute or so to complete and give you a preliminary idea of how your change affects
- * the code.
+ * <p>To run the short version of the StandardBenchmarker, call "./gradlew -p lucene/benchmark run".
+ * This should take a minute or so to complete and give you a preliminary idea of how your change
+ * affects the code.
  *
- * <p>To run the long version of the StandardBenchmarker, call "ant run-standard". This takes
- * considerably longer.
+ * <p>To run the long version of the StandardBenchmarker, call "./gradlew -p lucene/benchmark run
+ * -PtaskAlg=conf/standard.alg". This takes considerably longer, maybe 10 minutes.
  *
  * <p>The original code for these classes was donated by Andrzej Bialecki at
  * http://issues.apache.org/jira/browse/LUCENE-675 and has been updated by Grant Ingersoll to make
-Original file line number
+Diff line change
@@ -1 +1,2 @@
-    /data
+    /data
+    /work