From f32ae9cd31af7cab11503a0254b24d72857f8fda Mon Sep 17 00:00:00 2001 From: David 'Digit' Turner Date: Fri, 13 Sep 2024 13:39:47 +0200 Subject: [PATCH] Support jobserver client mode automatically. Detect that the environment variable MAKEFLAGS specifies a jobserver pool to use, and automatically use it to control build parallelism when this is the case. This is disabled is `--dry-run` or an explicit `-j` is passed on the command-line. Note that the `-l` option used to limit dispatch based on the overall load factor will still be in effect if used. + Use default member initialization for BuildConfig struct. + Add a new regression test suite that uses the misc/jobserver_pool.py script that was introduced in a previous commit, to verify that everything works properly. --- .github/workflows/linux.yml | 3 + doc/manual.asciidoc | 38 ++++- misc/jobserver_test.py | 255 ++++++++++++++++++++++++++++++++++ misc/jobserver_test_helper.py | 42 ++++++ src/build.cc | 17 ++- src/build.h | 28 ++-- src/graph.h | 4 + src/ninja.cc | 73 +++++++++- src/real_command_runner.cc | 20 ++- 9 files changed, 463 insertions(+), 17 deletions(-) create mode 100755 misc/jobserver_test.py create mode 100755 misc/jobserver_test_helper.py diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 07431bf797..a0cffcfe60 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -27,6 +27,7 @@ jobs: run: | ./ninja_test --gtest_color=yes ../../misc/output_test.py + ../../misc/jobserver_test.py - name: Build release ninja run: CLICOLOR_FORCE=1 ninja -f build-Release.ninja working-directory: build @@ -35,6 +36,7 @@ jobs: run: | ./ninja_test --gtest_color=yes ../../misc/output_test.py + ../../misc/jobserver_test.py build: runs-on: [ubuntu-latest] @@ -166,6 +168,7 @@ jobs: ./ninja all python3 misc/ninja_syntax_test.py ./misc/output_test.py + ./misc/jobserver_test.py build-aarch64: name: Build Linux ARM64 diff --git a/doc/manual.asciidoc b/doc/manual.asciidoc index 1e9ede9891..1e5ae64b4b 100644 --- a/doc/manual.asciidoc +++ b/doc/manual.asciidoc @@ -187,10 +187,42 @@ Ninja defaults to running commands in parallel anyway, so typically you don't need to pass `-j`.) +GNU Jobserver support +~~~~~~~~~~~~~~~~~~~~~ + +Since version 1.13., Ninja builds can follow the +https://https://www.gnu.org/software/make/manual/html_node/Job-Slots.html[GNU Make jobserver] +client protocol. This is useful when Ninja is invoked as part of a larger +build system controlled by a top-level GNU Make instance, or any other +jobserver pool implementation, as it allows better coordination between +concurrent build tasks. + +This feature is automatically enabled under the following conditions: + +- Dry-run (i.e. `-n` or `--dry-run`) is not enabled. + +- No explicit job count (e.g. `-j`) is passed on the command + line. + +- The `MAKEFLAGS` environment variable is defined and describes a valid + jobserver mode using `--jobserver-auth` or even `--jobserver-fds`. + +In this case, Ninja will use the jobserver pool of job slots to control +parallelism, instead of its default implementation of `-j`. + +Note that load-average limitations (i.e. when using `-l`) +are still being enforced in this mode. + +On Posix, Ninja supports both the `pipe` and `fifo` client modes, based on +the content of `MAKEFLAGS`. + +IMPORTANT: A warning will be printed when `pipe` mode is detected, as this +mode can be less reliable than `fifo`. + Environment variables ~~~~~~~~~~~~~~~~~~~~~ -Ninja supports one environment variable to control its behavior: +Ninja supports a few environment variables to control its behavior: `NINJA_STATUS`, the progress status printed before the rule being run. Several placeholders are available: @@ -215,6 +247,10 @@ The default progress status is `"[%f/%t] "` (note the trailing space to separate from the build rule). Another example of possible progress status could be `"[%u/%r/%f] "`. +If `MAKEFLAGS` is defined in the environment, if may alter how +Ninja dispatches parallel build commands. See the GNU Jobserver support +section for details. + Extra tools ~~~~~~~~~~~ diff --git a/misc/jobserver_test.py b/misc/jobserver_test.py new file mode 100755 index 0000000000..510c2e0e14 --- /dev/null +++ b/misc/jobserver_test.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 +# Copyright 2024 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from textwrap import dedent +import os +import platform +import subprocess +import tempfile +import typing as T +import sys +import unittest + +_SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__)) +_JOBSERVER_POOL_SCRIPT = os.path.join(_SCRIPT_DIR, "jobserver_pool.py") +_JOBSERVER_TEST_HELPER_SCRIPT = os.path.join(_SCRIPT_DIR, "jobserver_test_helper.py") + +_PLATFORM_IS_WINDOWS = platform.system() == "Windows" + +default_env = dict(os.environ) +default_env.pop("NINJA_STATUS", None) +default_env.pop("MAKEFLAGS", None) +default_env["TERM"] = "dumb" +NINJA_PATH = os.path.abspath("./ninja") + + +class BuildDir: + def __init__(self, build_ninja: str): + self.build_ninja = dedent(build_ninja) + self.d: T.Optional[tempfile.TemporaryDirectory] = None + + def __enter__(self): + self.d = tempfile.TemporaryDirectory() + with open(os.path.join(self.d.name, "build.ninja"), "w") as f: + f.write(self.build_ninja) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.d.cleanup() + + @property + def path(self) -> str: + assert self.d + return self.d.name + + def run( + self, + cmd_flags: T.Sequence[str] = [], + env: T.Dict[str, str] = default_env, + ) -> None: + """Run a command, raise exception on error. Do not capture outputs.""" + ret = subprocess.run(cmd_flags, env=env) + ret.check_returncode() + + def ninja_run( + self, + ninja_args: T.List[str], + prefix_args: T.List[str] = [], + extra_env: T.Dict[str, str] = {}, + ) -> "subprocess.CompletedProcess[str]": + ret = self.ninja_spawn( + ninja_args, + prefix_args=prefix_args, + extra_env=extra_env, + capture_output=False, + ) + ret.check_returncode() + return ret + + def ninja_clean(self) -> None: + self.ninja_run(["-t", "clean"]) + + def ninja_spawn( + self, + ninja_args: T.List[str], + prefix_args: T.List[str] = [], + extra_env: T.Dict[str, str] = {}, + capture_output: bool = True, + ) -> "subprocess.CompletedProcess[str]": + """Run Ninja command and capture outputs.""" + return subprocess.run( + prefix_args + [NINJA_PATH, "-C", self.path] + ninja_args, + text=True, + capture_output=capture_output, + env={**default_env, **extra_env}, + ) + + +def span_output_file(span_n: int) -> str: + return "out%02d" % span_n + + +def generate_build_plan(command_count: int) -> str: + """Generate a Ninja build plan for |command_count| parallel tasks. + + Each task calls the test helper script which waits for 50ms + then writes its own start and end time to its output file. + """ + result = f""" +rule span + command = {sys.executable} -S {_JOBSERVER_TEST_HELPER_SCRIPT} --duration-ms=50 $out + +""" + + for n in range(command_count): + result += "build %s: span\n" % span_output_file(n) + + result += "build all: phony %s\n" % " ".join( + [span_output_file(n) for n in range(command_count)] + ) + return result + + +def compute_max_overlapped_spans(build_dir: str, command_count: int) -> int: + """Compute the maximum number of overlapped spanned tasks. + + This reads the output files from |build_dir| and look at their start and end times + to compute the maximum number of tasks that were run in parallel. + """ + # Read the output files. + if command_count < 2: + return 0 + + spans: T.List[T.Tuple[int, int]] = [] + for n in range(command_count): + with open(os.path.join(build_dir, span_output_file(n)), "rb") as f: + content = f.read().decode("utf-8") + lines = content.splitlines() + assert len(lines) == 2, f"Unexpected output file content: [{content}]" + spans.append((int(lines[0]), int(lines[1]))) + + # Stupid but simple, for each span, count the number of other spans that overlap it. + max_overlaps = 1 + for n in range(command_count): + cur_start, cur_end = spans[n] + cur_overlaps = 1 + for m in range(command_count): + other_start, other_end = spans[m] + if n != m and other_end > cur_start and other_start < cur_end: + cur_overlaps += 1 + + if cur_overlaps > max_overlaps: + max_overlaps = cur_overlaps + + return max_overlaps + + +class JobserverTest(unittest.TestCase): + + def test_no_jobserver_client(self): + task_count = 4 + build_plan = generate_build_plan(task_count) + with BuildDir(build_plan) as b: + output = b.run([NINJA_PATH, "-C", b.path, f"-j{task_count}", "all"]) + + max_overlaps = compute_max_overlapped_spans(b.path, task_count) + self.assertEqual(max_overlaps, task_count) + + b.ninja_clean() + output = b.run([NINJA_PATH, "-C", b.path, "-j1", "all"]) + + max_overlaps = compute_max_overlapped_spans(b.path, task_count) + self.assertEqual(max_overlaps, 1) + + def _run_client_test(self, jobserver_args: T.List[str]) -> None: + task_count = 4 + build_plan = generate_build_plan(task_count) + with BuildDir(build_plan) as b: + # First, run the full 10 tasks with with 10 tokens, this should allow all + # tasks to run in parallel. + ret = b.ninja_run( + ninja_args=["all"], + prefix_args=jobserver_args + [f"--jobs={task_count}"], + ) + max_overlaps = compute_max_overlapped_spans(b.path, task_count) + self.assertEqual(max_overlaps, task_count) + + # Second, use 2 tokens only, and verify that this was enforced by Ninja. + b.ninja_clean() + b.ninja_run( + ["all"], + prefix_args=jobserver_args + ["--jobs=2"], + ) + max_overlaps = compute_max_overlapped_spans(b.path, task_count) + self.assertEqual(max_overlaps, 2) + + # Third, verify that --jobs=1 serializes all tasks. + b.ninja_clean() + b.ninja_run( + ["all"], + prefix_args=jobserver_args + ["--jobs=1"], + ) + max_overlaps = compute_max_overlapped_spans(b.path, task_count) + self.assertEqual(max_overlaps, 1) + + # Finally, verify that -j1 overrides the pool. + b.ninja_clean() + b.ninja_run( + ["-j1", "all"], + prefix_args=jobserver_args + [f"--jobs={task_count}"], + ) + max_overlaps = compute_max_overlapped_spans(b.path, task_count) + self.assertEqual(max_overlaps, 1) + + @unittest.skipIf(_PLATFORM_IS_WINDOWS, "These test methods do not work on Windows") + def test_jobserver_client_with_posix_pipe(self): + self._run_client_test([sys.executable, "-S", _JOBSERVER_POOL_SCRIPT, "--pipe"]) + + @unittest.skipIf(_PLATFORM_IS_WINDOWS, "These test methods do not work on Windows") + def test_jobserver_client_with_posix_fifo(self): + self._run_client_test([sys.executable, "-S", _JOBSERVER_POOL_SCRIPT]) + + def _test_MAKEFLAGS_value( + self, ninja_args: T.List[str] = [], prefix_args: T.List[str] = [] + ): + build_plan = r""" +rule print + command = echo MAKEFLAGS="[$$MAKEFLAGS]" + +build all: print +""" + with BuildDir(build_plan) as b: + ret = b.ninja_spawn( + ninja_args + ["--quiet", "all"], prefix_args=prefix_args + ) + self.assertEqual(ret.returncode, 0) + output = ret.stdout.strip() + pos = output.find("MAKEFLAGS=[") + self.assertNotEqual(pos, -1, "Could not find MAKEFLAGS in output!") + makeflags, sep, _ = output[pos + len("MAKEFLAGS=[") :].partition("]") + self.assertEqual(sep, "]", "Missing ] in output!: " + output) + self.assertTrue( + "--jobserver-auth=" in makeflags, + f"Missing --jobserver-auth from MAKEFLAGS [{makeflags}]\nSTDOUT [{ret.stdout}]\nSTDERR [{ret.stderr}]", + ) + + def test_client_passes_MAKEFLAGS(self): + self._test_MAKEFLAGS_value( + prefix_args=[sys.executable, "-S", _JOBSERVER_POOL_SCRIPT] + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/misc/jobserver_test_helper.py b/misc/jobserver_test_helper.py new file mode 100755 index 0000000000..8c238624af --- /dev/null +++ b/misc/jobserver_test_helper.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# Copyright 2024 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Simple utility used by the jobserver test. Wait for specific time, then write start/stop times to output file.""" + +import argparse +import time +import sys +from pathlib import Path + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--duration-ms", + default="50", + help="sleep duration in milliseconds (default 50)", + ) + parser.add_argument("output_file", type=Path, help="output file name.") + args = parser.parse_args() + + now_time_ns = time.time_ns() + time.sleep(int(args.duration_ms) / 1000.0) + args.output_file.write_text(f"{now_time_ns}\n{time.time_ns()}\n") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/build.cc b/src/build.cc index d256d940b0..b71b5ac473 100644 --- a/src/build.cc +++ b/src/build.cc @@ -36,6 +36,7 @@ #include "disk_interface.h" #include "explanations.h" #include "graph.h" +#include "jobserver.h" #include "metrics.h" #include "state.h" #include "status.h" @@ -163,6 +164,15 @@ Edge* Plan::FindWork() { return NULL; Edge* work = ready_.top(); + + // If jobserver mode is enabled, try to acquire a token first, + // and return null in case of failure. + if (builder_ && builder_->jobserver_) { + work->job_slot_ = builder_->jobserver_->TryAcquire(); + if (!work->job_slot_.IsValid()) + return nullptr; + } + ready_.pop(); return work; } @@ -199,6 +209,10 @@ bool Plan::EdgeFinished(Edge* edge, EdgeResult result, string* err) { edge->pool()->EdgeFinished(*edge); edge->pool()->RetrieveReadyEdges(&ready_); + // Release job slot if needed. + if (builder_ && builder_->jobserver_) + builder_->jobserver_->Release(std::move(edge->job_slot_)); + // The rest of this function only applies to successful commands. if (result != kEdgeSucceeded) return true; @@ -699,7 +713,8 @@ bool Builder::Build(string* err) { if (config_.dry_run) command_runner_.reset(new DryRunCommandRunner); else - command_runner_.reset(CommandRunner::factory(config_)); + command_runner_.reset(CommandRunner::factory(config_, jobserver_)); + ; } // We are about to start the build process. diff --git a/src/build.h b/src/build.h index ba39e7728a..5d14820096 100644 --- a/src/build.h +++ b/src/build.h @@ -24,6 +24,7 @@ #include "depfile_parser.h" #include "exit_status.h" #include "graph.h" +#include "jobserver.h" #include "util.h" // int64_t struct BuildLog; @@ -165,14 +166,15 @@ struct CommandRunner { virtual std::vector GetActiveEdges() { return std::vector(); } virtual void Abort() {} - /// Creates the RealCommandRunner - static CommandRunner* factory(const BuildConfig& config); + /// Creates the RealCommandRunner. \arg jobserver can be nullptr if there + /// is no jobserver pool to use. + static CommandRunner* factory(const BuildConfig& config, + Jobserver::Client* jobserver); }; /// Options (e.g. verbosity, parallelism) passed to a build. struct BuildConfig { - BuildConfig() : verbosity(NORMAL), dry_run(false), parallelism(1), - failures_allowed(1), max_load_average(-0.0f) {} + BuildConfig() = default; enum Verbosity { QUIET, // No output -- used when testing. @@ -180,13 +182,14 @@ struct BuildConfig { NORMAL, // regular output and status update VERBOSE }; - Verbosity verbosity; - bool dry_run; - int parallelism; - int failures_allowed; + Verbosity verbosity = NORMAL; + bool dry_run = false; + int parallelism = 1; + bool disable_jobserver_client = false; + int failures_allowed = 1; /// The maximum load average we must not exceed. A negative value /// means that we do not have any limit. - double max_load_average; + double max_load_average = -0.0f; DepfileParserOptions depfile_parser_options; }; @@ -197,6 +200,12 @@ struct Builder { int64_t start_time_millis); ~Builder(); + /// Set Jobserver client instance for this builder. + void SetJobserverClient(Jobserver::Client* jobserver_client) { + jobserver_ = jobserver_client; + ; + } + /// Clean up after interrupted commands by deleting output files. void Cleanup(); @@ -230,6 +239,7 @@ struct Builder { State* state_; const BuildConfig& config_; Plan plan_; + Jobserver::Client* jobserver_ = nullptr; std::unique_ptr command_runner_; Status* status_; diff --git a/src/graph.h b/src/graph.h index 806260e5d7..d98f1f9440 100644 --- a/src/graph.h +++ b/src/graph.h @@ -24,6 +24,7 @@ #include "dyndep.h" #include "eval_env.h" #include "explanations.h" +#include "jobserver.h" #include "timestamp.h" #include "util.h" @@ -263,6 +264,9 @@ struct Edge { bool use_console() const; bool maybe_phonycycle_diagnostic() const; + /// A Jobserver slot instance. Invalid by default. + Jobserver::Slot job_slot_; + // Historical info: how long did this edge take last time, // as per .ninja_log, if known? Defaults to -1 if unknown. int64_t prev_elapsed_time_millis = -1; diff --git a/src/ninja.cc b/src/ninja.cc index f681bfec11..dcbb2f3eab 100644 --- a/src/ninja.cc +++ b/src/ninja.cc @@ -38,14 +38,15 @@ #include "browse.h" #include "build.h" #include "build_log.h" -#include "deps_log.h" #include "clean.h" #include "command_collector.h" #include "debug_flags.h" #include "depfile_parser.h" +#include "deps_log.h" #include "disk_interface.h" #include "graph.h" #include "graphviz.h" +#include "jobserver.h" #include "json.h" #include "manifest_parser.h" #include "metrics.h" @@ -163,6 +164,10 @@ struct NinjaMain : public BuildLogUser { /// and record that in the edge itself. It will be used for ETA prediction. void ParsePreviousElapsedTimes(); + /// Create a jobserver client if needed. Return a nullptr value if + /// not. Prints info and warnings to \a status. + std::unique_ptr SetupJobserverClient(Status* status); + /// Build the targets listed on the command line. /// @return an exit code. int RunBuild(int argc, char** argv, Status* status); @@ -1541,9 +1546,60 @@ bool NinjaMain::EnsureBuildDirExists() { return true; } +std::unique_ptr NinjaMain::SetupJobserverClient( + Status* status) { + // Empty result by default. + std::unique_ptr result; + + // If dry-run or explicit job count, don't even look at MAKEFLAGS + if (config_.disable_jobserver_client) + return result; + + const char* makeflags = getenv("MAKEFLAGS"); + if (!makeflags) { + // MAKEFLAGS is not defined. + return result; + } + + std::string err; + Jobserver::Config jobserver_config; + if (!Jobserver::ParseNativeMakeFlagsValue(makeflags, &jobserver_config, + &err)) { + // MAKEFLAGS is defined but could not be parsed correctly. + if (config_.verbosity > BuildConfig::QUIET) + status->Warning("Unsupported MAKEFLAGS value: %s [%s]", err.c_str(), + makeflags); + return result; + } + + if (!jobserver_config.HasMode()) { + // MAKEFLAGS is defined, but does not describe a jobserver mode. + return result; + } + + if (config_.verbosity > BuildConfig::NO_STATUS_UPDATE) { + status->Info("Jobserver mode detected: %s", makeflags); +#ifndef _WIN32 + if (jobserver_config.mode == Jobserver::Config::kModePipe) { + status->Warning( + "Jobserver 'pipe' mode detected, a pool that implements 'fifo' mode " + "would be more reliable!"); + } +#endif + } + + result = Jobserver::Client::Create(jobserver_config, &err); + if (!result.get()) { + // Jobserver client initialization failed !? + if (config_.verbosity > BuildConfig::QUIET) + status->Error("Could not initialize jobserver: %s", err.c_str()); + } + return result; +} + int NinjaMain::RunBuild(int argc, char** argv, Status* status) { - string err; - vector targets; + std::string err; + std::vector targets; if (!CollectTargetsFromArgs(argc, argv, &targets, &err)) { status->Error("%s", err.c_str()); return 1; @@ -1553,6 +1609,15 @@ int NinjaMain::RunBuild(int argc, char** argv, Status* status) { Builder builder(&state_, config_, &build_log_, &deps_log_, &disk_interface_, status, start_time_millis_); + + // Detect jobserver context and inject Jobserver::Client into the builder + // if needed. + std::unique_ptr jobserver_client = + SetupJobserverClient(status); + if (jobserver_client.get()) { + builder.SetJobserverClient(jobserver_client.get()); + } + for (size_t i = 0; i < targets.size(); ++i) { if (!builder.AddTarget(targets[i], &err)) { if (!err.empty()) { @@ -1662,6 +1727,7 @@ int ReadFlags(int* argc, char*** argv, // We want to run N jobs in parallel. For N = 0, INT_MAX // is close enough to infinite for most sane builds. config->parallelism = value > 0 ? value : INT_MAX; + config->disable_jobserver_client = true; deferGuessParallelism.needGuess = false; break; } @@ -1687,6 +1753,7 @@ int ReadFlags(int* argc, char*** argv, } case 'n': config->dry_run = true; + config->disable_jobserver_client = true; break; case 't': options->tool = ChooseTool(optarg); diff --git a/src/real_command_runner.cc b/src/real_command_runner.cc index 453652f5e5..e7b206509d 100644 --- a/src/real_command_runner.cc +++ b/src/real_command_runner.cc @@ -13,18 +13,30 @@ // limitations under the License. #include "build.h" +#include "jobserver.h" #include "subprocess.h" struct RealCommandRunner : public CommandRunner { - explicit RealCommandRunner(const BuildConfig& config) : config_(config) {} + explicit RealCommandRunner(const BuildConfig& config, + Jobserver::Client* jobserver) + : config_(config), jobserver_(jobserver) {} size_t CanRunMore() const override; bool StartCommand(Edge* edge) override; bool WaitForCommand(Result* result) override; std::vector GetActiveEdges() override; void Abort() override; + void ClearJobTokens() { + if (jobserver_) { + for (Edge* edge : GetActiveEdges()) { + jobserver_->Release(std::move(edge->job_slot_)); + } + } + } + const BuildConfig& config_; SubprocessSet subprocs_; + Jobserver::Client* jobserver_ = nullptr; std::map subproc_to_edge_; }; @@ -38,6 +50,7 @@ std::vector RealCommandRunner::GetActiveEdges() { } void RealCommandRunner::Abort() { + ClearJobTokens(); subprocs_.Clear(); } @@ -93,6 +106,7 @@ bool RealCommandRunner::WaitForCommand(Result* result) { return true; } -CommandRunner* CommandRunner::factory(const BuildConfig& config) { - return new RealCommandRunner(config); +CommandRunner* CommandRunner::factory(const BuildConfig& config, + Jobserver::Client* jobserver) { + return new RealCommandRunner(config, jobserver); }