From 1f9bd250b77b2525d70d43eba8cce57da155a61b Mon Sep 17 00:00:00 2001
From: Govind Kamat
Date: Wed, 26 Apr 2023 00:05:58 -0700
Subject: [PATCH] Several improvements to data corpus expansion feature.

---
 scripts/expand-data-corpus.py | 127 +++++++++++++++++++---------------
 1 file changed, 73 insertions(+), 54 deletions(-)

diff --git a/scripts/expand-data-corpus.py b/scripts/expand-data-corpus.py
index 21466abf..cf69894a 100755
--- a/scripts/expand-data-corpus.py
+++ b/scripts/expand-data-corpus.py
@@ -9,21 +9,40 @@
 #
 # See the help message for more information.
 #
+
+import os
+import sys
+import signal
+import argparse
+import json
+import configparser
+
 help_msg = """
-NOTE: This is a beta feature.
+NOTE: This is a beta feature.  The user model, interface and options
+are subject to change.
 
 This tool is intended for the purpose of expanding the size of the
 data corpus associated with an OSB workload.  Currently, this
 capability is implemented only for the http_logs workload.
 
+TLDR: to generate a 100 GB corpus and then run a test against it:
+
+$ expand-data-corpus.py --corpus-size 100 --output-file-suffix 100gb
+
+$ opensearch-benchmark execute-test --workload http_logs \\
+    --workload-params=generated_corpus:t ...
+
 The script generates new documents by duplicating ones in the existing
-corpus files, while modifying the timestamp field.  The script takes
-the starting timestamp and interval as arguments, in addition to the
-desired size of the target corpus.  This will lead to the fields other
-than the timestamp potentially recurring multiple times.  However, the
-efficacy of the queries should not be impacted if the script
-parameters are selected appropriately as indicated below.
+corpus files, while modifying the timestamp field.  It takes several
+arguments, listed below.  The two primary ones specify the desired
+target corpus size and the suffix used to name the generated corpus
+files.  The remaining options, tagged with "EXPERT", are mainly
+intended for advanced users.
+
+Duplication of documents will lead to the fields other than the
+timestamp potentially recurring multiple times.  However, the efficacy
+of the queries should not be impacted if the script parameters are
+selected appropriately as indicated below.
 
 The larger data corpus would be suitable for running the workload at
 a larger scale on, for instance, clusters with multiple data nodes or
@@ -40,28 +59,23 @@
 The script can be invoked multiple times to create several corpora
 that will all be loaded concurrently.
 
-To be compliant with the time ranges used in the existing queries, the
-following guideline should be followed:
-
-  * Use an interval prarameter of -2 for every 1 GB corpus size.  This
-    will generate 20 documents per timestamp for a 10 GB size, for
-    instance:
-      expand-data-corpus.py \\
-        --corpus-size 10 \\
-        --interval -20 \\
-        --output-file-suffix 100gb
-
 Prerequisites:
 
   * The data corpus files associated with http_logs need to already
     have been downloaded.  The easiest way to do this is to carry out
-    a normal run, perhaps indexing only.
+    a normal run, perhaps by limiting it to indexing-only.
 
   * The input file should be one of the data corpus files downloaded
     from the http_logs OSB workloads repository.  The script cues off
     the text alignment in those files.
 
-Limitations:
+Notes and limitations:
+
+  * This feature is currently available only for OpenSearch clusters
+    and Elasticsearch 7 clusters.
+
+  * The options tagged with "EXPERT" are intended for advanced users
+    and should not be needed in normal use.
  * There is currently no mechanism provided to manage the generated
    corpora files.  The workaround for now is to delete them manually
@@ -70,9 +84,15 @@
    subsequently, if desired.
 
   * OSB runs with and without the 'generated_corpus' flag should not
-    be interleaved, since they target different indices.  All
-    generated copora files should be removed prior to operating again
-    on the default copora files packaged with the http_logs workload.
+    generally be interleaved, since they target different
+    indices.  However, OSB can be run in ingest-only mode to ingest
+    both the generated and default corpora in two separate runs.  Once
+    ingested, queries packaged with the workload will operate on the
+    entire loaded data set.
+
+  * To be compliant with the time ranges used in the existing queries,
+    the interval parameter defaults to -2 for every 1 GB of corpus
+    size.  This generates ~20 documents per timestamp for a 10 GB size.
 
   * The script is not currently optimized for speed.  Allow for about
     30 min for every 100 GB.
 
@@ -83,15 +103,8 @@
 
 """
 
-import os
-import sys
-import signal
-import argparse
-import json
-import configparser
-
 def handler(signum, frame):
-    exit(1)
+    sys.exit(1)
 
 
 class DocGenerator:
@@ -125,19 +138,20 @@ def get_next_doc(self):
 
 
 class ArgParser(argparse.ArgumentParser):
-    def usg(self, message: str=None) -> None:
+    def usage_msg(self, message: str=None) -> None:
         if message:
             print(message, file=sys.stderr)
             print(file=sys.stderr)
         self.print_help(sys.stderr)
         sys.exit(1)
 
-    def error(self, err_msg: str) -> None:
-        sys.stderr.write('error: %s\n' % err_msg)
-        self.usg()
+    def error(self, message):
+        sys.stderr.write('error: %s\n' % message)
+        self.usage_msg()
 
 
 def generate_docs(workload: str,
+                  repository: str,
                   input_file: str,
                   output_file_suffix: str,
                   n_docs: int,
@@ -154,11 +168,12 @@ def generate_docs(workload: str,
     config.read(benchmark_home + '/.benchmark/benchmark.ini')
 
     root_dir = config['node']['root.dir']
-    workload_dir= root_dir + '/workloads/default/' + workload
+    workload_dir = root_dir + '/workloads/' + repository + '/' + workload
     data_dir = config['benchmarks']['local.dataset.cache'] + '/' + workload
 
     output_file = data_dir + '/documents-' + output_file_suffix + '.json'
-    input_file = data_dir + '/' + input_file
+    if '/' not in input_file:
+        input_file = data_dir + '/' + input_file
 
     out = open(output_file, 'w')
     offsets = open(output_file + '.offset', 'w')
@@ -227,52 +242,57 @@ def main(args: list) -> None:
 
     parser.add_argument('-w', '--workload', default='http_logs',
                         help="workload name, default: %(default)s")
-    parser.add_argument('-f', '--input-file',
-                        default='documents-241998.json',
-                        help="input file name, default: %(default)s")
+    parser.add_argument('-r', '--workload-repository', default='default',
+                        help="workload repository name, default: %(default)s")
+    parser.add_argument('-c', '--corpus-size', type=int,
+                        help="size of corpus to generate in GB")
     parser.add_argument('-o', '--output-file-suffix', default='generated',
                         help="suffix for output file name, "
                         "documents-SUFFIX.json, default: %(default)s")
+    parser.add_argument('-f', '--input-file',
+                        default='documents-241998.json',
+                        help="[EXPERT] input file name, default: %(default)s")
     parser.add_argument('-n', '--number-of-docs', type=int,
-                        help="number of documents to generate")
-    parser.add_argument('-c', '--corpus-size', type=int,
-                        help="size of corpus to generate in GB")
-    parser.add_argument('-i', '--interval', type=int, default=1,
-                        help="interval between consecutive timestamps, "
-                        "use a negative number to specify multiple docs per timestamp, "
timestamp, " - "default: %(default)d") + help="[EXPERT] number of documents to generate") + parser.add_argument('-i', '--interval', type=int, + help="[EXPERT] interval between consecutive " + "timestamps, use a negative number to specify multiple " + "docs per timestamp") parser.add_argument('-t', '--start-timestamp', type=int, default=893964618, - help="start timestamp, default: %(default)d") + help="[EXPERT] start timestamp, default: %(default)d") parser.add_argument('-b', '--batch-size', default=50000, - help="batch size per OSB client thread, " + help="[EXPERT] batch size per OSB client thread, " "default: %(default)d") args = parser.parse_args() workload = args.workload + repository = args.workload_repository input_file = args.input_file output_file_suffix = args.output_file_suffix n_docs = args.number_of_docs corpus_size = args.corpus_size - interval = args.interval + interval = args.interval if args.interval is not None else \ + corpus_size * -2 start_timestamp = args.start_timestamp batch_size = args.batch_size if n_docs and corpus_size: - parser.usg(script_name + + parser.usage_msg(script_name + ": can specify either number of documents" "or corpus size, but not both") elif not n_docs and not corpus_size: - parser.usg(script_name + + parser.usage_msg(script_name + ": must specify number of documents or corpus size") if workload != 'http_logs': - parser.usg(script_name + + parser.usage_msg(script_name + ': only the "http_logs" workload is currently supported') generate_docs(workload, + repository, input_file, output_file_suffix, n_docs, @@ -284,4 +304,3 @@ def main(args: list) -> None: if __name__ == '__main__': sys.exit(main(sys.argv[1:])) -