diff --git a/pkg/daemon/controlplane.go b/pkg/daemon/controlplane.go new file mode 100644 index 0000000000..d32c650a5b --- /dev/null +++ b/pkg/daemon/controlplane.go @@ -0,0 +1,79 @@ +package daemon + +// This file provides changes that we make to the control plane +// only. + +import ( + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strings" + + "github.com/golang/glog" + "github.com/pkg/errors" +) + +// setRootDeviceSchedulerBFQ switches to the `bfq` I/O scheduler +// for the root block device to better share I/O between etcd +// and other processes. See +// https://github.com/openshift/machine-config-operator/issues/1897 +// Note this is the current systemd default in Fedora, but not RHEL8, +// except for NVMe devices. +func setRootDeviceSchedulerBFQ() error { + sched := "bfq" + + rootDevSysfs, err := getRootBlockDeviceSysfs() + if err != nil { + return err + } + + schedulerPath := filepath.Join(rootDevSysfs, "/queue/scheduler") + schedulerContentsBuf, err := ioutil.ReadFile(schedulerPath) + if err != nil { + return err + } + schedulerContents := string(schedulerContentsBuf) + if strings.Contains(schedulerContents, fmt.Sprintf("[%s]", sched)) { + glog.Infof("Device %s already uses scheduler %s", rootDevSysfs, sched) + return nil + } + + f, err := os.OpenFile(schedulerPath, os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return err + } + defer f.Close() + _, err = f.Write([]byte(sched)) + if err != nil { + return err + } + glog.Infof("Set root blockdev %s to use scheduler %v", rootDevSysfs, sched) + + return nil +} + +// updateOstreeObjectSync enables "per-object-fsync" which helps avoid +// latency spikes for etcd; see https://github.com/ostreedev/ostree/pull/2152 +func updateOstreeObjectSync() error { + if err := exec.Command("ostree", "--repo=/sysroot/ostree/repo", "config", "set", "core.per-object-fsync", "true").Run(); err != nil { + return errors.Wrapf(err, "Failed to set per-object-fsync for ostree") + } + return nil +} + +// 
initializeControlPlane performs setup for the node that should +// only occur on the control plane. Currently this switches the IO +// scheduler and starts a goroutine acting as a small controller +// for reflecting the etcd leader status in the node object to help +// the MCC coordinate control plane updates. +func (dn *Daemon) initializeControlPlane() error { +	if err := setRootDeviceSchedulerBFQ(); err != nil { +		return err +	} +	if err := updateOstreeObjectSync(); err != nil { +		return err +	} +	return nil +} diff --git a/pkg/daemon/coreos.go b/pkg/daemon/coreos.go new file mode 100644 index 0000000000..a477ed6ce7 --- /dev/null +++ b/pkg/daemon/coreos.go @@ -0,0 +1,52 @@ +package daemon + +// This file provides routines that apply on Fedora CoreOS style systems, +// including derivatives like RHEL CoreOS. + +import ( +	"fmt" +	"os" +	"path/filepath" + +	"github.com/pkg/errors" +) + +// byLabel returns the udev generated symlink to the block device with the given label +func byLabel(label string) string { +	return fmt.Sprintf("/dev/disk/by-label/%s", label) +} + +// getParentDeviceSysfs returns e.g. /sys/devices/pci0000:00/0000:00:05.0/virtio2/block/vda from /dev/vda4, though +// it can be more complex than that with e.g. NVMe. 
+func getParentDeviceSysfs(device string) (string, error) { + target, err := os.Readlink(device) + if err != nil { + return "", errors.Wrapf(err, "reading %s", device) + } + sysfsDevLink := fmt.Sprintf("/sys/class/block/%s", filepath.Base(target)) + sysfsDev, err := filepath.EvalSymlinks(sysfsDevLink) + if err != nil { + return "", errors.Wrapf(err, "parsing %s", sysfsDevLink) + } + if _, err := os.Stat(filepath.Join(sysfsDev, "partition")); err == nil { + sysfsDev = filepath.Dir(sysfsDev) + } + return sysfsDev, nil +} + +// getRootBlockDeviceSysfs returns the path to the block +// device backing the root partition on a FCOS system +func getRootBlockDeviceSysfs() (string, error) { + // Check for the `crypt_rootfs` label; this exists for RHCOS >= 4.3 but <= 4.6. + // See https://github.com/openshift/enhancements/blob/master/enhancements/rhcos/automated-policy-based-disk-encryption.md + luksRoot := byLabel("crypt_rootfs") + if _, err := os.Stat(luksRoot); err == nil { + return getParentDeviceSysfs(luksRoot) + } + // This is what we expect on FCOS and RHCOS <= 4.2 + root := byLabel("root") + if _, err := os.Stat(root); err == nil { + return getParentDeviceSysfs(root) + } + return "", fmt.Errorf("Failed to find %s", root) +} diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 22b2b6fe82..87842dcb8a 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -113,7 +113,12 @@ type Daemon struct { // isControlPlane is true if this node is a control plane (master). // The machine may also be a worker (with schedulable masters). 
isControlPlane bool - booting bool + // nodeInitialized is true when we've performed one-time initialization + // after having updated the node object + nodeInitialized bool + // booting is true when all initial synchronization to the target + // machineconfig is done + booting bool currentConfigPath string @@ -382,15 +387,24 @@ func (dn *Daemon) updateErrorState(err error) { } } -// initializeNode is called the first time we get our node object -func (dn *Daemon) initializeNode() { +// initializeNode is called the first time we get our node object; however to +// ensure we handle failures: everything called from here should be idempotent. +func (dn *Daemon) initializeNode() error { + if dn.nodeInitialized { + return nil + } // Some parts of the MCO dispatch on whether or not we're managing a control plane node if _, isControlPlane := dn.node.Labels[ctrlcommon.MasterLabel]; isControlPlane { glog.Infof("Node %s is part of the control plane", dn.node.Name) + if err := dn.initializeControlPlane(); err != nil { + return err + } dn.isControlPlane = true } else { glog.Infof("Node %s is not labeled %s", dn.node.Name, ctrlcommon.MasterLabel) } + dn.nodeInitialized = true + return nil } func (dn *Daemon) syncNode(key string) error { @@ -428,7 +442,9 @@ func (dn *Daemon) syncNode(key string) error { node = node.DeepCopy() if dn.node == nil { dn.node = node - dn.initializeNode() + if err := dn.initializeNode(); err != nil { + return err + } } else { dn.node = node } diff --git a/pkg/daemon/pivot/utils/run.go b/pkg/daemon/pivot/utils/run.go index 34193809e8..569ab39f59 100644 --- a/pkg/daemon/pivot/utils/run.go +++ b/pkg/daemon/pivot/utils/run.go @@ -65,3 +65,16 @@ func RunExt(retries int, command string, args ...string) (string, error) { }, command, args...) } + +// RunExtBackground is like RunExt, but queues the command for "nice" CPU and +// I/O scheduling. 
+func RunExtBackground(retries int, command string, args ...string) (string, error) { + args = append([]string{"--", "ionice", "-c", "3", command}, args...) + command = "nice" + return runExtBackoff(wait.Backoff{ + Steps: retries + 1, // times to try + Duration: 5 * time.Second, // sleep between tries + Factor: 2, // factor by which to increase sleep + }, + command, args...) +} diff --git a/pkg/daemon/pivot/utils/run_test.go b/pkg/daemon/pivot/utils/run_test.go index 0fc714a702..27dacd1614 100644 --- a/pkg/daemon/pivot/utils/run_test.go +++ b/pkg/daemon/pivot/utils/run_test.go @@ -27,3 +27,9 @@ func TestRunExt(t *testing.T) { assert.Nil(t, err) assert.Equal(t, int64(3), s.Size()) } + +func TestRunExtBackground(t *testing.T) { + o, err := RunExtBackground(0, "echo", "echo", "from", "TestRunExtBackground") + assert.Nil(t, err) + assert.Equal(t, o, "echo from TestRunExtBackground") +} diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index 6a585682e0..988e750da2 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -238,7 +238,7 @@ func podmanCopy(imgURL, osImageContentDir string) (err error) { args := []string{"pull", "-q"} args = append(args, authArgs...) args = append(args, imgURL) - _, err = pivotutils.RunExt(numRetriesNetCommands, "podman", args...) + _, err = pivotutils.RunExtBackground(numRetriesNetCommands, "podman", args...) if err != nil { return } @@ -257,7 +257,7 @@ func podmanCopy(imgURL, osImageContentDir string) (err error) { // copy the content from create container locally into a temp directory under /run/machine-os-content/ cid := strings.TrimSpace(string(cidBuf)) args = []string{"cp", fmt.Sprintf("%s:/", cid), osImageContentDir} - _, err = pivotutils.RunExt(numRetriesNetCommands, "podman", args...) + _, err = pivotutils.RunExtBackground(numRetriesNetCommands, "podman", args...) 
// Set selinux context to var_run_t to avoid selinux denial args = []string{"-R", "-t", "var_run_t", osImageContentDir} @@ -294,7 +294,7 @@ func ExtractOSImage(imgURL string) (osImageContentDir string, err error) { args := []string{"image", "extract", "--path", "/:" + osImageContentDir} args = append(args, registryConfig...) args = append(args, imgURL) - if _, err = pivotutils.RunExt(cmdRetriesCount, "oc", args...); err != nil { + if _, err = pivotutils.RunExtBackground(cmdRetriesCount, "oc", args...); err != nil { // Workaround fixes for the environment where oc image extract fails. // See https://bugzilla.redhat.com/show_bug.cgi?id=1862979 glog.Infof("Falling back to using podman cp to fetch OS image content") diff --git a/templates/master/00-master/_base/units/rpm-ostreed.service.yaml b/templates/master/00-master/_base/units/rpm-ostreed.service.yaml new file mode 100644 index 0000000000..6a9e214729 --- /dev/null +++ b/templates/master/00-master/_base/units/rpm-ostreed.service.yaml @@ -0,0 +1,9 @@ +name: rpm-ostreed.service +dropins: +- name: mco-controlplane-nice.conf + contents: | + # See https://github.com/openshift/machine-config-operator/issues/1897 + [Service] + Nice=10 + IOSchedulingClass=best-effort + IOSchedulingPriority=6