Bug 1850057: Use bfq scheduler on control plane, idle I/O for rpm-ostreed #1957

Merged (2 commits), Aug 26, 2020
79 changes: 79 additions & 0 deletions pkg/daemon/controlplane.go
@@ -0,0 +1,79 @@
package daemon

// This file provides changes that we make to the control plane
// only.

import (
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"path/filepath"
	"strings"

	"github.com/golang/glog"
	"github.com/pkg/errors"
)

// setRootDeviceSchedulerBFQ switches to the `bfq` I/O scheduler
// for the root block device to better share I/O between etcd
// and other processes. See
// https://github.com/openshift/machine-config-operator/issues/1897
// Note this is the current systemd default in Fedora, but not RHEL8,
// except for NVMe devices.

Contributor: So why wouldn't we do this for all root devices on all nodes in the cluster?

Member Author: Mostly to limit the "blast radius" of this PR - changing the control plane only minimizes risk. But it does also mean more divergence.

It probably does make sense to do an across-the-board switch though...maybe as a follow-up to this? Or would you rather do it in one go?

Member Author (@cgwalters, Aug 25, 2020): The other aspect is that currently the OS update vs. etcd contention happens because we're updating the OS while etcd is still running - that only matters on the control plane. For regular worker nodes that don't have static pods, everything will have been drained before we start the update - so there's inherently less need for bfq.

(Long term I think we really want cgroups v2 + a filesystem that honors I/O priorities - it's the only way to get real balance; bfq is mostly heuristics + support for baseline I/O priorities.)

@zvonkok (Aug 26, 2020): @sinnykumari @runcom @cgwalters @smarterclayton I would suggest syncing up with the perf dept; there were some concerns regarding BFQ as the default scheduler in RHEL. There are several workloads where BFQ did not perform best and mq-deadline did better. There are probably more workloads that would need to be evaluated.

Even though BFQ is great on the low end, past experience shows it doesn't do as well on the high end. It would need to be thoroughly tested if RHCOS considered changing it. See http://post-office.corp.redhat.com/archives/kscale-list/2019-September/msg00010.html

BFQ may work great for the control plane where we're only running "one" workload (etcd); on the worker nodes we have a variety of workloads that may or may not benefit from BFQ. It would also be interesting to see how the infra pods react to such a change.

Contributor: Thank you for raising the concern. This PR uses BFQ only for control plane nodes. We will keep this in mind and make sure to talk to the perf team if we plan to use BFQ for worker/custom pools.

func setRootDeviceSchedulerBFQ() error {
	sched := "bfq"

	rootDevSysfs, err := getRootBlockDeviceSysfs()
	if err != nil {
		return err
	}

	schedulerPath := filepath.Join(rootDevSysfs, "/queue/scheduler")
	schedulerContentsBuf, err := ioutil.ReadFile(schedulerPath)
	if err != nil {
		return err
	}
	schedulerContents := string(schedulerContentsBuf)
	if strings.Contains(schedulerContents, fmt.Sprintf("[%s]", sched)) {
		glog.Infof("Device %s already uses scheduler %s", rootDevSysfs, sched)
		return nil
	}

	f, err := os.OpenFile(schedulerPath, os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return err
	}
	defer f.Close()
	_, err = f.Write([]byte(sched))
	if err != nil {
		return err
	}
	glog.Infof("Set root blockdev %s to use scheduler %v", rootDevSysfs, sched)

	return nil
}
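
For reference, the scheduler file under sysfs lists the available schedulers with the active one in brackets (typically something like mq-deadline kyber [bfq] none), which is what the strings.Contains check above keys off of. Below is a minimal standalone sketch of parsing the active scheduler out of that format; the helper name and sample contents are illustrative assumptions, not part of this PR.

package main

import (
	"fmt"
	"strings"
)

// parseActiveScheduler returns the bracketed (active) scheduler from the
// contents of /sys/block/<dev>/queue/scheduler, or "" if none is marked.
func parseActiveScheduler(contents string) string {
	for _, field := range strings.Fields(contents) {
		if strings.HasPrefix(field, "[") && strings.HasSuffix(field, "]") {
			return strings.Trim(field, "[]")
		}
	}
	return ""
}

func main() {
	// Sample contents; the exact scheduler list depends on the kernel build.
	fmt.Println(parseActiveScheduler("mq-deadline kyber [bfq] none")) // prints: bfq
}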

// updateOstreeObjectSync enables "per-object-fsync" which helps avoid
// latency spikes for etcd; see https://github.com/ostreedev/ostree/pull/2152
func updateOstreeObjectSync() error {
	if err := exec.Command("ostree", "--repo=/sysroot/ostree/repo", "config", "set", "core.per-object-fsync", "true").Run(); err != nil {

Contributor: Question: do you think ionice or the above has the more dramatic effect on reducing latency?

Member Author: If you think it's important enough, I can try to make new release images with just this change and not the bfq default. But to answer your question, both do matter; see:

https://github.com/cgwalters/workboard/blob/master/openshift/bz1850057-etcd-osupdate/ionice.md#concurrent-updates-with-none-scheduler

Contributor: I don't think it's important enough to change anything, I am just curious. You have done enough work :)

		return errors.Wrapf(err, "Failed to set per-object-fsync for ostree")
	}
	return nil
}
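
To double-check that the repo option stuck, something like the following should work; this is a sketch that assumes the ostree CLI's `config get` subcommand and the /sysroot/ostree/repo path used above.

package main

import (
	"fmt"
	"os/exec"
	"strings"
)

func main() {
	// Sketch: read back the per-object-fsync setting written by updateOstreeObjectSync.
	out, err := exec.Command("ostree", "--repo=/sysroot/ostree/repo", "config", "get", "core.per-object-fsync").Output()
	if err != nil {
		fmt.Println("ostree config get failed:", err)
		return
	}
	fmt.Println("core.per-object-fsync =", strings.TrimSpace(string(out)))
}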

// initializeControlPlane performs setup for the node that should
// only occur on the control plane. Currently this switches the IO
// scheduler and starts a goroutine acting as a small controller
// for reflecting the etcd leader status in the node object to help
// the MCC coordinate control plane updates.
func (dn *Daemon) initializeControlPlane() error {
	if err := setRootDeviceSchedulerBFQ(); err != nil {
		return err
	}
	if err := updateOstreeObjectSync(); err != nil {
		return err
	}
	return nil
}
52 changes: 52 additions & 0 deletions pkg/daemon/coreos.go
@@ -0,0 +1,52 @@
package daemon

// This file provides routines that apply on Fedora CoreOS style systems,
// including derivatives like RHEL CoreOS.

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/pkg/errors"
)

// byLabel returns the udev generated symlink to the block device with the given label
func byLabel(label string) string {
	return fmt.Sprintf("/dev/disk/by-label/%s", label)
}

// getParentDeviceSysfs returns e.g. /sys/devices/pci0000:00/0000:00:05.0/virtio2/block/vda from /dev/vda4, though
// it can be more complex than that with e.g. NVMe.
func getParentDeviceSysfs(device string) (string, error) {
	target, err := os.Readlink(device)
	if err != nil {
		return "", errors.Wrapf(err, "reading %s", device)
	}
	sysfsDevLink := fmt.Sprintf("/sys/class/block/%s", filepath.Base(target))
	sysfsDev, err := filepath.EvalSymlinks(sysfsDevLink)
	if err != nil {
		return "", errors.Wrapf(err, "parsing %s", sysfsDevLink)
	}
	if _, err := os.Stat(filepath.Join(sysfsDev, "partition")); err == nil {
		sysfsDev = filepath.Dir(sysfsDev)
	}
	return sysfsDev, nil
}

// getRootBlockDeviceSysfs returns the path to the block
// device backing the root partition on a FCOS system
func getRootBlockDeviceSysfs() (string, error) {
	// Check for the `crypt_rootfs` label; this exists for RHCOS >= 4.3 but <= 4.6.
	// See https://github.com/openshift/enhancements/blob/master/enhancements/rhcos/automated-policy-based-disk-encryption.md
	luksRoot := byLabel("crypt_rootfs")
	if _, err := os.Stat(luksRoot); err == nil {
		return getParentDeviceSysfs(luksRoot)
	}
	// This is what we expect on FCOS and RHCOS <= 4.2
	root := byLabel("root")
	if _, err := os.Stat(root); err == nil {
		return getParentDeviceSysfs(root)
	}
	return "", fmt.Errorf("Failed to find %s", root)
}
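
To make the lookup concrete: on a typical virtio guest, /dev/disk/by-label/root is a udev symlink to something like ../../vda4, /sys/class/block/vda4 resolves to the partition's sysfs directory, and stripping the partition component leaves the parent device directory ending in .../block/vda. A hypothetical usage sketch from inside the daemon package follows; printRootScheduler is not part of this PR and assumes the fmt, io/ioutil, and path/filepath imports used in the files above.

// printRootScheduler logs the scheduler file for the root block device.
// Hypothetical helper for illustration only.
func printRootScheduler() error {
	rootDevSysfs, err := getRootBlockDeviceSysfs()
	if err != nil {
		return err
	}
	// e.g. /sys/devices/pci0000:00/0000:00:05.0/virtio2/block/vda/queue/scheduler
	schedulerPath := filepath.Join(rootDevSysfs, "queue/scheduler")
	contents, err := ioutil.ReadFile(schedulerPath)
	if err != nil {
		return err
	}
	fmt.Printf("%s: %s", schedulerPath, contents)
	return nil
}
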
24 changes: 20 additions & 4 deletions pkg/daemon/daemon.go
@@ -113,7 +113,12 @@ type Daemon struct {
	// isControlPlane is true if this node is a control plane (master).
	// The machine may also be a worker (with schedulable masters).
	isControlPlane bool
-	booting bool
+	// nodeInitialized is true when we've performed one-time initialization
+	// after having updated the node object
+	nodeInitialized bool
+	// booting is true when all initial synchronization to the target
+	// machineconfig is done
+	booting bool

	currentConfigPath string

@@ -382,15 +387,24 @@ func (dn *Daemon) updateErrorState(err error) {
	}
}

-// initializeNode is called the first time we get our node object
-func (dn *Daemon) initializeNode() {
+// initializeNode is called the first time we get our node object; however to
+// ensure we handle failures: everything called from here should be idempotent.
+func (dn *Daemon) initializeNode() error {
+	if dn.nodeInitialized {
+		return nil
+	}
	// Some parts of the MCO dispatch on whether or not we're managing a control plane node
	if _, isControlPlane := dn.node.Labels[ctrlcommon.MasterLabel]; isControlPlane {
		glog.Infof("Node %s is part of the control plane", dn.node.Name)
+		if err := dn.initializeControlPlane(); err != nil {
+			return err
+		}
		dn.isControlPlane = true
	} else {
		glog.Infof("Node %s is not labeled %s", dn.node.Name, ctrlcommon.MasterLabel)
	}
+	dn.nodeInitialized = true
+	return nil
}

@@ -428,7 +442,9 @@ func (dn *Daemon) syncNode(key string) error {
	node = node.DeepCopy()
	if dn.node == nil {
		dn.node = node
-		dn.initializeNode()
+		if err := dn.initializeNode(); err != nil {
+			return err
+		}
	} else {
		dn.node = node
	}
13 changes: 13 additions & 0 deletions pkg/daemon/pivot/utils/run.go
@@ -65,3 +65,16 @@ func RunExt(retries int, command string, args ...string) (string, error) {
	},
		command, args...)
}

// RunExtBackground is like RunExt, but queues the command for "nice" CPU and
// I/O scheduling.
func RunExtBackground(retries int, command string, args ...string) (string, error) {
	args = append([]string{"--", "ionice", "-c", "3", command}, args...)
	command = "nice"
	return runExtBackoff(wait.Backoff{
		Steps:    retries + 1,     // times to try
		Duration: 5 * time.Second, // sleep between tries
		Factor:   2,               // factor by which to increase sleep
	},
		command, args...)
}
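
Concretely, the wrapping above turns a call like RunExtBackground(n, "podman", "pull", ...) into the command line nice -- ionice -c 3 podman pull ..., i.e. slightly reduced CPU priority plus the idle I/O scheduling class (-c 3). A small standalone sketch of the argv construction; the podman arguments and registry path are just illustrative.

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Mirror the wrapping done in RunExtBackground for an example invocation.
	command := "podman"
	args := []string{"pull", "-q", "example.registry/machine-os-content"}
	args = append([]string{"--", "ionice", "-c", "3", command}, args...)
	command = "nice"
	fmt.Println(command, strings.Join(args, " "))
	// Output: nice -- ionice -c 3 podman pull -q example.registry/machine-os-content
}
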
6 changes: 6 additions & 0 deletions pkg/daemon/pivot/utils/run_test.go
@@ -27,3 +27,9 @@ func TestRunExt(t *testing.T) {
	assert.Nil(t, err)
	assert.Equal(t, int64(3), s.Size())
}

func TestRunExtBackground(t *testing.T) {
	o, err := RunExtBackground(0, "echo", "echo", "from", "TestRunExtBackground")
	assert.Nil(t, err)
	assert.Equal(t, o, "echo from TestRunExtBackground")
}
6 changes: 3 additions & 3 deletions pkg/daemon/update.go
@@ -238,7 +238,7 @@ func podmanCopy(imgURL, osImageContentDir string) (err error) {
	args := []string{"pull", "-q"}
	args = append(args, authArgs...)
	args = append(args, imgURL)
-	_, err = pivotutils.RunExt(numRetriesNetCommands, "podman", args...)
+	_, err = pivotutils.RunExtBackground(numRetriesNetCommands, "podman", args...)
	if err != nil {
		return
	}
@@ -257,7 +257,7 @@ func podmanCopy(imgURL, osImageContentDir string) (err error) {
	// copy the content from create container locally into a temp directory under /run/machine-os-content/
	cid := strings.TrimSpace(string(cidBuf))
	args = []string{"cp", fmt.Sprintf("%s:/", cid), osImageContentDir}
-	_, err = pivotutils.RunExt(numRetriesNetCommands, "podman", args...)
+	_, err = pivotutils.RunExtBackground(numRetriesNetCommands, "podman", args...)

	// Set selinux context to var_run_t to avoid selinux denial
	args = []string{"-R", "-t", "var_run_t", osImageContentDir}
@@ -294,7 +294,7 @@ func ExtractOSImage(imgURL string) (osImageContentDir string, err error) {
	args := []string{"image", "extract", "--path", "/:" + osImageContentDir}
	args = append(args, registryConfig...)
	args = append(args, imgURL)
-	if _, err = pivotutils.RunExt(cmdRetriesCount, "oc", args...); err != nil {
+	if _, err = pivotutils.RunExtBackground(cmdRetriesCount, "oc", args...); err != nil {
		// Workaround fixes for the environment where oc image extract fails.
		// See https://bugzilla.redhat.com/show_bug.cgi?id=1862979
		glog.Infof("Falling back to using podman cp to fetch OS image content")
@@ -0,0 +1,9 @@
name: rpm-ostreed.service
dropins:
  - name: mco-controlplane-nice.conf
    contents: |
      # See https://github.com/openshift/machine-config-operator/issues/1897
      [Service]
      Nice=10
      IOSchedulingClass=best-effort
      IOSchedulingPriority=6
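
To sanity-check that the dropin is in effect on a control plane node, one option is to query the unit's scheduling properties; this is a sketch, assuming systemd exposes them as Nice, IOSchedulingClass, and IOSchedulingPriority (exact output formatting varies by systemd version).

package main

import (
	"fmt"
	"os/exec"
)

func main() {
	// Sketch: read back the scheduling-related properties of rpm-ostreed.service.
	out, err := exec.Command("systemctl", "show", "rpm-ostreed.service",
		"-p", "Nice", "-p", "IOSchedulingClass", "-p", "IOSchedulingPriority").Output()
	if err != nil {
		fmt.Println("systemctl show failed:", err)
		return
	}
	fmt.Print(string(out))
}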