Bug 1850057: Use bfq scheduler on control plane, idle I/O for rpm-ostreed #1957
@@ -0,0 +1,79 @@
package daemon

// This file provides changes that we make to the control plane
// only.

import (
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"path/filepath"
	"strings"

	"github.com/golang/glog"
	"github.com/pkg/errors"
)

// setRootDeviceSchedulerBFQ switches to the `bfq` I/O scheduler
// for the root block device to better share I/O between etcd
// and other processes. See
// https://github.com/openshift/machine-config-operator/issues/1897
// Note this is the current systemd default in Fedora, but not RHEL8,
// except for NVMe devices.
func setRootDeviceSchedulerBFQ() error {
	sched := "bfq"

	rootDevSysfs, err := getRootBlockDeviceSysfs()
	if err != nil {
		return err
	}

	schedulerPath := filepath.Join(rootDevSysfs, "/queue/scheduler")
	schedulerContentsBuf, err := ioutil.ReadFile(schedulerPath)
	if err != nil {
		return err
	}
	schedulerContents := string(schedulerContentsBuf)
	if strings.Contains(schedulerContents, fmt.Sprintf("[%s]", sched)) {
		glog.Infof("Device %s already uses scheduler %s", rootDevSysfs, sched)
		return nil
	}

	f, err := os.OpenFile(schedulerPath, os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return err
	}
	defer f.Close()
	_, err = f.Write([]byte(sched))
	if err != nil {
		return err
	}
	glog.Infof("Set root blockdev %s to use scheduler %v", rootDevSysfs, sched)

	return nil
}

// updateOstreeObjectSync enables "per-object-fsync" which helps avoid
// latency spikes for etcd; see https://github.com/ostreedev/ostree/pull/2152
func updateOstreeObjectSync() error {
	if err := exec.Command("ostree", "--repo=/sysroot/ostree/repo", "config", "set", "core.per-object-fsync", "true").Run(); err != nil {
		return errors.Wrapf(err, "Failed to set per-object-fsync for ostree")
	}
	return nil
}

Inline review comments on this hunk:

question: do you think …

If you think it's important enough I can try to make new release images with just this change and not the …

I don't think it's important enough to change anything, I am just curious. You have done enough work :)
// initializeControlPlane performs setup for the node that should
// only occur on the control plane. Currently this switches the IO
// scheduler and starts a goroutine acting as a small controller
// for reflecting the etcd leader status in the node object to help
// the MCC coordinate control plane updates.
func (dn *Daemon) initializeControlPlane() error {
	if err := setRootDeviceSchedulerBFQ(); err != nil {
		return err
	}
	if err := updateOstreeObjectSync(); err != nil {
		return err
	}
	return nil
}
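For reference, the kernel reports the currently active scheduler as the bracketed entry in queue/scheduler (for example "mq-deadline kyber [bfq] none"), which is what the strings.Contains check above relies on, and the per-object-fsync option can be read back with ostree config get. Below is a minimal standalone verification sketch, not part of the PR; the /sys/class/block/vda path is a placeholder for whatever getRootBlockDeviceSysfs would return on a real node.

package main

import (
	"fmt"
	"io/ioutil"
	"os/exec"
	"strings"
)

// activeScheduler parses the sysfs scheduler file; the active scheduler is
// the entry surrounded by square brackets.
func activeScheduler(sysfsDev string) (string, error) {
	buf, err := ioutil.ReadFile(sysfsDev + "/queue/scheduler")
	if err != nil {
		return "", err
	}
	for _, field := range strings.Fields(string(buf)) {
		if strings.HasPrefix(field, "[") && strings.HasSuffix(field, "]") {
			return strings.Trim(field, "[]"), nil
		}
	}
	return "", fmt.Errorf("no active scheduler marked in %s/queue/scheduler", sysfsDev)
}

func main() {
	// Placeholder device path; substitute the sysfs path of the root block device.
	sched, err := activeScheduler("/sys/class/block/vda")
	if err != nil {
		fmt.Println(err)
	} else {
		fmt.Println("active scheduler:", sched)
	}

	// Read back the repo option that updateOstreeObjectSync sets.
	out, err := exec.Command("ostree", "--repo=/sysroot/ostree/repo",
		"config", "get", "core.per-object-fsync").CombinedOutput()
	if err != nil {
		fmt.Println("ostree config get failed:", err)
		return
	}
	fmt.Println("core.per-object-fsync:", strings.TrimSpace(string(out)))
}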
@@ -0,0 +1,52 @@
package daemon

// This file provides routines that apply on Fedora CoreOS style systems,
// including derivatives like RHEL CoreOS.

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/pkg/errors"
)

// byLabel returns the udev generated symlink to the block device with the given label
func byLabel(label string) string {
	return fmt.Sprintf("/dev/disk/by-label/%s", label)
}

// getParentDeviceSysfs returns e.g. /sys/devices/pci0000:00/0000:00:05.0/virtio2/block/vda from /dev/vda4, though
// it can be more complex than that with e.g. NVMe.
func getParentDeviceSysfs(device string) (string, error) {
	target, err := os.Readlink(device)
	if err != nil {
		return "", errors.Wrapf(err, "reading %s", device)
	}
	sysfsDevLink := fmt.Sprintf("/sys/class/block/%s", filepath.Base(target))
	sysfsDev, err := filepath.EvalSymlinks(sysfsDevLink)
	if err != nil {
		return "", errors.Wrapf(err, "parsing %s", sysfsDevLink)
	}
	if _, err := os.Stat(filepath.Join(sysfsDev, "partition")); err == nil {
		sysfsDev = filepath.Dir(sysfsDev)
	}
	return sysfsDev, nil
}

// getRootBlockDeviceSysfs returns the path to the block
// device backing the root partition on a FCOS system
func getRootBlockDeviceSysfs() (string, error) {
	// Check for the `crypt_rootfs` label; this exists for RHCOS >= 4.3 but <= 4.6.
	// See https://github.com/openshift/enhancements/blob/master/enhancements/rhcos/automated-policy-based-disk-encryption.md
	luksRoot := byLabel("crypt_rootfs")
	if _, err := os.Stat(luksRoot); err == nil {
		return getParentDeviceSysfs(luksRoot)
	}
	// This is what we expect on FCOS and RHCOS <= 4.2
	root := byLabel("root")
	if _, err := os.Stat(root); err == nil {
		return getParentDeviceSysfs(root)
	}
	return "", fmt.Errorf("Failed to find %s", root)
}
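To make the lookup chain concrete: the by-label entry is a udev-managed symlink to the partition device node (e.g. ../../vda4), the corresponding /sys/class/block/<name> link resolves into /sys/devices/..., and the presence of a "partition" attribute means the parent directory is the whole disk that owns the queue/scheduler knob. A standalone sketch of that walk, not part of the PR; the "root" label and any printed device names are examples only.

package main

import (
	"fmt"
	"os"
	"path/filepath"
)

func main() {
	// On FCOS/RHCOS the root filesystem partition carries the "root" label
	// (or "crypt_rootfs" when LUKS is in use).
	link := "/dev/disk/by-label/root"

	// The by-label entry is a symlink to the partition device node, e.g. ../../vda4.
	target, err := os.Readlink(link)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	part := filepath.Base(target)

	// /sys/class/block/<part> is itself a symlink into /sys/devices/...;
	// a "partition" attribute means the parent directory is the whole disk.
	sysfs, err := filepath.EvalSymlinks("/sys/class/block/" + part)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	if _, err := os.Stat(filepath.Join(sysfs, "partition")); err == nil {
		sysfs = filepath.Dir(sysfs)
	}

	fmt.Println("partition:", part)
	fmt.Println("parent device sysfs:", sysfs)
	fmt.Println("scheduler knob:", filepath.Join(sysfs, "queue", "scheduler"))
}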
@@ -0,0 +1,9 @@
name: rpm-ostreed.service
dropins:
  - name: mco-controlplane-nice.conf
    contents: |
      # See https://github.com/openshift/machine-config-operator/issues/1897
      [Service]
      Nice=10
      IOSchedulingClass=best-effort
      IOSchedulingPriority=6
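The drop-in lowers rpm-ostreed's CPU niceness and I/O scheduling priority so OS update I/O yields to etcd. One way to confirm systemd picked the drop-in up is to query the unit's scheduling properties; a minimal sketch, assuming systemctl is available on the node being checked (Nice, IOSchedulingClass and IOSchedulingPriority are standard systemd service properties):

package main

import (
	"fmt"
	"os/exec"
)

func main() {
	// Ask systemd for the scheduling-related properties of rpm-ostreed.
	// With the drop-in applied, Nice should report 10 and the I/O scheduling
	// class/priority should reflect best-effort at level 6.
	out, err := exec.Command("systemctl", "show", "rpm-ostreed.service",
		"-p", "Nice", "-p", "IOSchedulingClass", "-p", "IOSchedulingPriority").CombinedOutput()
	if err != nil {
		fmt.Println("systemctl show failed:", err)
		return
	}
	fmt.Print(string(out))
}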
So why wouldn't we do this for all root devices on all nodes in the cluster?
Mostly to limit the "blast radius" of this PR - changing the control plane only minimizes risk. But it does also mean more divergence.
It probably does make sense to do an across-the-board switch though... maybe as a follow-up to this? Or would you rather do it in one go?
The other aspect is that currently the OS update vs. etcd contention happens because we're updating the OS while etcd is still running - that only matters on the control plane. For regular worker nodes that don't have static pods, everything will have been drained before we start the update - so there's inherently less need for bfq.
(Long term I think we really want cgroups v2 + a filesystem that honors IO priorities - it's the only way to get real balance; bfq is mostly heuristics + support for baseline IO priorities.)
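For context on "baseline IO priorities": these are the per-process values set via the Linux ioprio_set(2) syscall, which is also how systemd implements the IOSchedulingClass/IOSchedulingPriority settings in the rpm-ostreed drop-in, and which bfq takes into account when sharing bandwidth. A minimal Linux-only sketch, not part of the PR, with constants transcribed from include/linux/ioprio.h:

package main

import (
	"fmt"
	"syscall"
)

// Constants from include/linux/ioprio.h.
const (
	ioprioWhoProcess = 1  // IOPRIO_WHO_PROCESS: target a single process
	ioprioClassBE    = 2  // IOPRIO_CLASS_BE: best-effort class
	ioprioClassShift = 13 // the class lives in the top bits of the priority value
)

func main() {
	// Request best-effort class, level 6 for the calling process (who=0),
	// mirroring the drop-in's IOSchedulingClass=best-effort / IOSchedulingPriority=6.
	prio := uintptr(ioprioClassBE<<ioprioClassShift | 6)
	if _, _, errno := syscall.Syscall(syscall.SYS_IOPRIO_SET, ioprioWhoProcess, 0, prio); errno != 0 {
		fmt.Println("ioprio_set failed:", errno)
		return
	}
	fmt.Println("calling process now has best-effort I/O priority 6")
}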
@sinnykumari @runcom @cgwalters @smarterclayton I would suggest syncing up with the perf dept; there were some concerns regarding BFQ as the default scheduler in RHEL. There are several workloads where BFQ did not perform the best. MQ-deadline did better. There are probably more workloads that need to be strobed.
Even though BFQ is great on the low end, past experience shows it doesn't do as well on the high end. It would need to be thoroughly tested if RHCOS considered changing it. See http://post-office.corp.redhat.com/archives/kscale-list/2019-September/msg00010.html
BFQ may work great for the control plane, where we're only running "one" workload (etcd); on the worker nodes we have a variety of workloads that may or may not benefit from BFQ. It would also be interesting to see how the infra pods react to such a change.
Thank you for raising the concern. This PR uses BFQ only for control plane nodes. We will keep this in mind and make sure to talk to the perf team if we plan to use BFQ for worker/custom pools.