Skip to content

Commit

Permalink
Remote benchmark tweaks (#741)
Browse files Browse the repository at this point in the history
* Remote benchmark tweaks

* refactor

---------

Co-authored-by: Sebastián Galkin <code@amisdelabc.com>
  • Loading branch information
dcherian and paraseba authored Feb 17, 2025
1 parent 0fc57df commit 6e973a3
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 59 deletions.
7 changes: 7 additions & 0 deletions icechunk-python/benchmarks/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ def synth_dataset(request) -> Store:
ds.storage_config = ds.storage_config.with_overwrite(
**TEST_BUCKETS[where]
).with_extra(prefix=extra_prefix, force_idempotent=True)
if ds.setupfn is None:
# these datasets aren't automatically set up
# so skip if the data haven't been written yet.
try:
ds.store()
except ValueError as e:
pytest.skip(reason=str(e))
return ds


Expand Down
4 changes: 2 additions & 2 deletions icechunk-python/benchmarks/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@ def assert_cwd_is_icechunk_python():
)


def get_commit(ref: str) -> str:
def get_full_commit(ref: str) -> str:
return subprocess.run(
["git", "rev-parse", ref], capture_output=True, text=True, check=True
).stdout.strip()[:8]
).stdout.strip()


def rdms() -> str:
Expand Down
4 changes: 4 additions & 0 deletions icechunk-python/benchmarks/most_recent.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env sh

echo $(ls -t ./.benchmarks/**/* | head -n 1)
pytest-benchmark compare --group=group,func,param --sort=fullname --columns=median --name=normal `ls -t ./.benchmarks/**/* | head -n 1`
121 changes: 64 additions & 57 deletions icechunk-python/benchmarks/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,10 @@
from helpers import (
assert_cwd_is_icechunk_python,
get_coiled_kwargs,
get_commit,
get_full_commit,
setup_logger,
)

import icechunk as ic

logger = setup_logger()

PIP_OPTIONS = "--disable-pip-version-check -q"
Expand Down Expand Up @@ -54,21 +52,22 @@ class Runner:

def __init__(self, *, ref: str, where: str) -> None:
self.ref = ref
self.commit = get_commit(ref)
self.full_commit = get_full_commit(ref)
self.commit = self.full_commit[:8]
self.where = where

@property
def pip_github_url(self) -> str:
# optional extras cannot be specified here, "not guaranteed to work"
# https://pip.pypa.io/en/stable/topics/vcs-support/#url-fragments
return f"git+https://github.com/earth-mover/icechunk.git@{self.commit}#subdirectory=icechunk-python"
return f"git+https://github.com/earth-mover/icechunk.git@{self.full_commit}#subdirectory=icechunk-python"

@property
def prefix(self) -> str:
try:
return f"v{ic.spec_version():02d}"
except AttributeError:
return f"{self.ref}_{self.commit}"
# try:
# return f"v{ic.spec_version():02d}"
# except AttributeError:
return f"{self.ref}_{self.commit}"

@property
def ref_commit(self) -> str:
Expand All @@ -84,12 +83,11 @@ def execute(cmd: str) -> None:

def initialize(self) -> None:
"""Builds virtual envs etc."""
raise NotImplementedError
self.sync_benchmarks_folder()

def setup(self, *, force: bool):
"""Creates datasets for read benchmarks."""
logger.info(f"setup_benchmarks for {self.ref} / {self.commit}")
self.sync_benchmarks_folder()
cmd = (
f"pytest {PYTEST_OPTIONS} -nauto "
f"-m setup_benchmarks --force-setup={force} "
Expand All @@ -104,23 +102,21 @@ def run(self, *, pytest_extra: str = "") -> None:
"""Actually runs the benchmarks."""
logger.info(f"running benchmarks for {self.ref} / {self.commit}")

self.sync_benchmarks_folder()

# shorten the name so `pytest-benchmark compare` is readable
clean_ref = self.ref.removeprefix("icechunk-v0.1.0-alph")

assert self.bench_store_dir is not None
# Note: .benchmarks is the default location for pytest-benchmark
cmd = (
f"pytest {PYTEST_OPTIONS} "
f"pytest {pytest_extra} "
f"--benchmark-storage={self.bench_store_dir}/.benchmarks "
f"--benchmark-save={clean_ref}_{self.commit}_{self.where} "
f"--where={self.where} "
f"--icechunk-prefix=benchmarks/{self.prefix}/ "
f"{pytest_extra} "
f"{PYTEST_OPTIONS} "
"benchmarks/"
)
logger.info(cmd)
print(cmd)

self.execute(cmd, check=False)

Expand Down Expand Up @@ -151,11 +147,49 @@ def initialize(self) -> None:
subprocess.run(["python3", "-m", "venv", ".venv"], cwd=self.pycwd, check=True)
cmd = f"pip install {PIP_OPTIONS} {self.pip_github_url} {deps}"
self.execute(cmd, check=True)
super().initialize()

def run(self, *, pytest_extra: str = "") -> None:
super().run(pytest_extra=pytest_extra)
if len(refs) > 1:
files = sorted(
glob.glob("./.benchmarks/**/*.json", recursive=True),
key=os.path.getmtime,
reverse=True,
)[-len(refs) :]
# TODO: Use `just` here when we figure that out.
subprocess.run(
[
"pytest-benchmark",
"compare",
"--group=group,func,param",
"--sort=fullname",
"--columns=median",
"--name=normal",
*files,
]
)


class CoiledRunner(Runner):
bench_store_dir = "."

def get_coiled_run_args(self) -> tuple[str]:
ckwargs = self.get_coiled_kwargs()
return (
"coiled",
"run",
"--interactive",
"--name",
f"icebench-{self.commit}", # cluster name
"--keepalive",
"10m",
f"--workspace={ckwargs['workspace']}", # cloud
f"--vm-type={ckwargs['vm_type']}",
f"--software={ckwargs['software']}",
f"--region={ckwargs['region']}",
)

def get_coiled_kwargs(self):
COILED_SOFTWARE = {
"icechunk-v0.1.0-alpha.1": "icechunk-alpha-release",
Expand Down Expand Up @@ -185,34 +219,26 @@ def initialize(self) -> None:
},
pip=[self.pip_github_url, "coiled", *deps],
)
super().initialize()

def execute(self, cmd, **kwargs) -> None:
ckwargs = self.get_coiled_kwargs()
ls = [f for f in os.listdir(CURRENTDIR) if f not in [".benchmarks", "benchmarks"]]
toignore = " ".join(ls)
subprocess.run([*self.get_coiled_run_args(), cmd], **kwargs)

def sync_benchmarks_folder(self) -> None:
subprocess.run(
[
"coiled",
"run",
"--interactive",
"--name",
f"icebench-{self.commit}", # cluster name
"--sync",
f"--sync-ignore={toignore!r}",
"--keepalive",
"10m",
f"--workspace={ckwargs['workspace']}", # cloud
f"--vm-type={ckwargs['vm_type']}",
f"--software={ckwargs['software']}",
f"--region={ckwargs['region']}",
cmd,
*self.get_coiled_run_args(),
"--file",
"benchmarks/",
"ls -alh ./.benchmarks/",
],
**kwargs,
check=True,
)

def sync_benchmarks_folder(self) -> None:
# uses command-line --sync option
pass
def run(self, *, pytest_extra: str = "") -> None:
super().run(pytest_extra=pytest_extra)
# This prints to screen but we could upload to a bucket in here.
self.execute("sh benchmarks/most_recent.sh")


def init_for_ref(runner: Runner):
Expand All @@ -223,7 +249,7 @@ def init_for_ref(runner: Runner):
parser = argparse.ArgumentParser()
parser.add_argument("refs", help="refs to run benchmarks for", nargs="+")
parser.add_argument("--pytest", help="passed to pytest", default="")
parser.add_argument("--where", help="where to run? [local]", default="local")
parser.add_argument("--where", help="where to run? [local|s3|gcs]", default="local")
parser.add_argument(
"--skip-setup",
help="skip setup step, useful for benchmarks that don't need data",
Expand Down Expand Up @@ -254,25 +280,6 @@ def init_for_ref(runner: Runner):
for runner in tqdm.tqdm(runners):
runner.run(pytest_extra=args.pytest)

if len(refs) > 1:
files = sorted(
glob.glob("./.benchmarks/**/*.json", recursive=True),
key=os.path.getmtime,
reverse=True,
)[-len(refs) :]
# TODO: Use `just` here when we figure that out.
subprocess.run(
[
"pytest-benchmark",
"compare",
"--group=group,func,param",
"--sort=fullname",
"--columns=median",
"--name=normal",
*files,
]
)


# Compare wish-list:
# 1. skip differences < X%
Expand Down

0 comments on commit 6e973a3

Please sign in to comment.