Commit

Several improvements to data corpus expansion feature.
gkamat committed Apr 26, 2023
1 parent df6e074 commit 1f9bd25
Showing 1 changed file with 73 additions and 54 deletions.
127 changes: 73 additions & 54 deletions scripts/expand-data-corpus.py
@@ -9,21 +9,40 @@
#
# See the help message for more information.
#

import os
import sys
import signal
import argparse
import json
import configparser

help_msg = """
NOTE: This is a beta feature.
NOTE: This is a beta feature. The user model, interface and options
are subject to change.
This tool is intended for expanding the size of the data corpus
associated with an OSB workload. Currently, this capability is
implemented only for the http_logs workload.
TLDR: to generate a 100 GB corpus and then run a test against it:
$ expand-data-corpus.py --corpus-size 100 --output-file-suffix 100gb
$ opensearch-benchmark execute_test --workload http_logs \\
--workload_params=generated_corpus:t ...
The script generates new documents by duplicating ones in the existing
corpus files, while modifying the timestamp field. The script takes
the starting timestamp and interval as arguments, in addition to the
desired size of the target corpus. This will lead to the fields other
than the timestamp potentially recurring multiple times. However, the
efficacy of the queries should not be impacted if the script
parameters are selected appropriately as indicated below.
corpus files, while modifying the timestamp field. It takes several
arguments, listed below. The two primary ones deal with specifying
the desired target corpus size and the corpus name. The remaining
options, tagged with "EXPERT", are mainly intended for advanced users.
Duplication of documents will lead to the fields other than the
timestamp potentially recurring multiple times. However, the efficacy
of the queries should not be impacted if the script parameters are
selected appropriately as indicated below.
The larger data corpus would be suitable for running the workload at a
larger scale on, for instance, clusters with multiple data nodes or
@@ -40,28 +59,23 @@
The script can be invoked multiple times to create several corpora
that will all be loaded concurrently.
To be compliant with the time ranges used in the existing queries, the
following guideline should be followed:
* Use an interval parameter of -2 for every 1 GB corpus size. This
will generate 20 documents per timestamp for a 10 GB size, for
instance:
expand-data-corpus.py \\
--corpus-size 10 \\
--interval -20 \\
--output-file-suffix 10gb
Prerequisites:
* The data corpus files associated with http_logs need to already
have been downloaded. The easiest way to do this is to carry out
a normal run, perhaps indexing only.
a normal run, perhaps by limiting it to indexing-only.
* The input file should be one of the data corpus files downloaded
from the http_logs OSB workloads repository. The script cues off
the text alignment in those files.
Limitations:
Notes and limitations:
* This feature is currently available only for OpenSearch clusters
and Elasticsearch 7 clusters.
* The options tagged with "EXPERT" are intended for advanced users
and should not be needed in normal use.
* There is currently no mechanism provided to manage the generated
corpora files. The workaround for now is to delete them manually
@@ -70,9 +84,15 @@
subsequently, if desired.
* OSB runs with and without the 'generated_corpus' flag should not
be interleaved, since they target different indices. All
generated corpora files should be removed prior to operating again
on the default corpora files packaged with the http_logs workload.
generally be interleaved, since they target different
indices. However, OSB can be run in ingest-only mode to ingest
both the generated and default corpora in two separate runs. Once
ingested, queries packaged with the workload will operate on the
entire loaded data set.
* To be compliant with the time ranges used in the existing queries,
the interval parameter is set to -2 for every 1 GB corpus size.
This generates ~20 documents per timestamp for a 10 GB size.
* The script is not currently optimized for speed. Allow about
30 min for every 100 GB.
@@ -83,15 +103,8 @@
"""

import os
import sys
import signal
import argparse
import json
import configparser

def handler(signum, frame):
exit(1)
sys.exit(1)
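# A handler like this is normally registered for SIGINT so that an
# interrupted run exits cleanly; the registration is not shown in the
# visible hunks, but a minimal sketch would be:
#
#     signal.signal(signal.SIGINT, handler)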

class DocGenerator:

@@ -125,19 +138,20 @@ def get_next_doc(self):


class ArgParser(argparse.ArgumentParser):
def usg(self, message: str=None) -> None:
def usage_msg(self, message: str=None) -> None:
if message:
print(message, file=sys.stderr)
print(file=sys.stderr)
self.print_help(sys.stderr)
sys.exit(1)

def error(self, err_msg: str) -> None:
sys.stderr.write('error: %s\n' % err_msg)
self.usg()
def error(self, message):
sys.stderr.write('error: %s\n' % message)
self.usage_msg()
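# With this override, argparse invokes error() on any bad command line
# (unknown option, invalid value), and usage_msg() then prints the full
# help text and exits with status 1. A minimal sketch of the resulting
# behavior, using a hypothetical parser built from this class:
#
#     parser = ArgParser(description='example')
#     parser.parse_args(['--no-such-option'])   # prints help, exits 1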


def generate_docs(workload: str,
repository: str,
input_file: str,
output_file_suffix: str,
n_docs: int,
@@ -154,11 +168,12 @@ def generate_docs(workload: str,
config.read(benchmark_home + '/.benchmark/benchmark.ini')

root_dir = config['node']['root.dir']
workload_dir= root_dir + '/workloads/default/' + workload
workload_dir= root_dir + '/workloads/' + repository + '/' + workload
data_dir = config['benchmarks']['local.dataset.cache'] + '/' + workload
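# For reference, the two settings read above come from
# benchmark_home + '/.benchmark/benchmark.ini'. An illustrative fragment
# (the paths shown are hypothetical examples, not values from this commit):
#
#     [node]
#     root.dir = /home/user/.benchmark/benchmarks
#
#     [benchmarks]
#     local.dataset.cache = /home/user/.benchmark/benchmarks/data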

output_file = data_dir + '/documents-' + output_file_suffix + '.json'
input_file = data_dir + '/' + input_file
if '/' not in input_file:
input_file = data_dir + '/' + input_file

out = open(output_file, 'w')
offsets = open(output_file + '.offset', 'w')
@@ -227,52 +242,57 @@ def main(args: list) -> None:
parser.add_argument('-w', '--workload',
default='http_logs',
help="workload name, default: %(default)s")
parser.add_argument('-f', '--input-file',
default='documents-241998.json',
help="input file name, default: %(default)s")
parser.add_argument('-r', '--workload-repository', default='default',
help="workload name, default: %(default)s")
parser.add_argument('-c', '--corpus-size', type=int,
help="size of corpus to generate in GB")
parser.add_argument('-o', '--output-file-suffix',
default='generated',
help="suffix for output file name, "
"documents-SUFFIX.json, default: %(default)s")
parser.add_argument('-f', '--input-file',
default='documents-241998.json',
help="[EXPERT] input file name, default: %(default)s")
parser.add_argument('-n', '--number-of-docs', type=int,
help="number of documents to generate")
parser.add_argument('-c', '--corpus-size', type=int,
help="size of corpus to generate in GB")
parser.add_argument('-i', '--interval', type=int, default=1,
help="interval between consecutive timestamps, "
"use a negative number to specify multiple docs per timestamp, "
"default: %(default)d")
help="[EXPERT] number of documents to generate")
parser.add_argument('-i', '--interval', type=int,
help="[EXPERT] interval between consecutive "
"timestamps, use a negative number to specify multiple "
"docs per timestamp")
parser.add_argument('-t', '--start-timestamp', type=int,
default=893964618,
help="start timestamp, default: %(default)d")
help="[EXPERT] start timestamp, default: %(default)d")
parser.add_argument('-b', '--batch-size', default=50000,
help="batch size per OSB client thread, "
help="[EXPERT] batch size per OSB client thread, "
"default: %(default)d")

args = parser.parse_args()

workload = args.workload
repository = args.workload_repository
input_file = args.input_file
output_file_suffix = args.output_file_suffix
n_docs = args.number_of_docs
corpus_size = args.corpus_size
interval = args.interval
interval = args.interval if args.interval is not None else \
corpus_size * -2
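# Worked example of the default above: with --corpus-size 10 and no
# explicit --interval, interval becomes 10 * -2 = -20, i.e. roughly 20
# generated documents share each timestamp, which matches the guidance
# in the help text.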
start_timestamp = args.start_timestamp
batch_size = args.batch_size

if n_docs and corpus_size:
parser.usg(script_name +
parser.usage_msg(script_name +
": can specify either number of documents"
"or corpus size, but not both")
elif not n_docs and not corpus_size:
parser.usg(script_name +
parser.usage_msg(script_name +
": must specify number of documents or corpus size")

if workload != 'http_logs':
parser.usg(script_name +
parser.usage_msg(script_name +
': only the "http_logs" workload is currently supported')

generate_docs(workload,
repository,
input_file,
output_file_suffix,
n_docs,
@@ -284,4 +304,3 @@ def main(args: list) -> None:

if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))
