From 3221924dae94b472a3fe43261883caa405cf1e94 Mon Sep 17 00:00:00 2001 From: David Joshy Date: Tue, 6 Jun 2023 09:54:29 -0400 Subject: [PATCH] daemon: removed pending config checks --- pkg/daemon/daemon.go | 198 +++++++++---------------------------------- pkg/daemon/update.go | 149 +------------------------------- 2 files changed, 42 insertions(+), 305 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index adcb752068..6e4350a337 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -152,9 +152,6 @@ const ( // currentConfigPath is where we store the current config on disk to validate // against annotations changes currentConfigPath = "/etc/machine-config-daemon/currentconfig" - // pendingStateMessageID is the id we store the pending state in journal. We use it to - // also retrieve the pending config after a reboot - pendingStateMessageID = "machine-config-daemon-pending-state" // originalContainerBin is the path at which we've stashed the MCD container's /usr/bin // in the host namespace. We use this for executing any extra binaries we have in our @@ -426,17 +423,6 @@ func PrepareNamespace(target string) error { return nil } -// writer implements io.Writer interface as a pass-through for klog. -type writer struct { - logFunc func(args ...interface{}) -} - -// Write passes string(p) into writer's logFunc and always returns len(p) -func (w writer) Write(p []byte) (n int, err error) { - w.logFunc(string(p)) - return len(p), nil -} - // worker runs a worker thread that just dequeues items, processes them, and marks them done. // It enforces that the syncHandler is never invoked concurrently with the same key. func (dn *Daemon) worker() { @@ -1324,11 +1310,10 @@ type stateAndConfigs struct { bootstrapping bool state string currentConfig *mcfgv1.MachineConfig - pendingConfig *mcfgv1.MachineConfig desiredConfig *mcfgv1.MachineConfig } -func (dn *Daemon) getStateAndConfigs(pendingConfigName string) (*stateAndConfigs, error) { +func (dn *Daemon) getStateAndConfigs() (*stateAndConfigs, error) { _, err := os.Lstat(constants.InitialNodeAnnotationsFilePath) var bootstrapping bool if err != nil { @@ -1395,20 +1380,6 @@ func (dn *Daemon) getStateAndConfigs(pendingConfigName string) (*stateAndConfigs } glog.Infof("state: %s", state) - var pendingConfig *mcfgv1.MachineConfig - // We usually expect that if current != desired, pending == desired; however, - // it can happen that desiredConfig changed while we were rebooting. - if pendingConfigName == desiredConfigName { - pendingConfig = desiredConfig - } else if pendingConfigName != "" { - pendingConfig, err = dn.mcLister.Get(pendingConfigName) - if err != nil { - return nil, err - } - - glog.Infof("Pending config: %s", pendingConfigName) - } - var degradedReason string if state == constants.MachineConfigDaemonStateDegraded { degradedReason, err = getNodeAnnotation(dn.node, constants.MachineConfigDaemonReasonAnnotationKey) @@ -1422,7 +1393,6 @@ func (dn *Daemon) getStateAndConfigs(pendingConfigName string) (*stateAndConfigs return &stateAndConfigs{ bootstrapping: bootstrapping, currentConfig: currentConfig, - pendingConfig: pendingConfig, desiredConfig: desiredConfig, state: state, }, nil @@ -1466,46 +1436,6 @@ func (dn *Daemon) LogSystemData() { } } -const ( - pendingConfigPath = "/etc/machine-config-daemon/state.json" -) - -type pendingConfigState struct { - PendingConfig string `json:"pendingConfig,omitempty"` - BootID string `json:"bootID,omitempty"` -} - -// XXX: drop this -func (dn *Daemon) writePendingConfig(desiredConfig *mcfgv1.MachineConfig) error { - t := &pendingConfigState{ - PendingConfig: desiredConfig.GetName(), - BootID: dn.bootID, - } - b, err := json.Marshal(t) - if err != nil { - return err - } - return writeFileAtomicallyWithDefaults(pendingConfigPath, b) -} - -// XXX: drop this -// we need this compatibility layer for now -func (dn *Daemon) getPendingConfig() (*pendingConfigState, error) { - s, err := os.ReadFile("/etc/machine-config-daemon/state.json") - if err != nil { - if !os.IsNotExist(err) { - return nil, fmt.Errorf("loading transient state: %w", err) - } - return nil, nil - } - var p pendingConfigState - if err := json.Unmarshal(s, &p); err != nil { - return nil, fmt.Errorf("parsing transient state: %w", err) - } - - return &p, nil -} - // getCurrentConfigOnDisk retrieves the serialized MachineConfig written to /etc // which exists during the time we're trying to perform an update. func (dn *Daemon) getCurrentConfigOnDisk() (*mcfgv1.MachineConfig, error) { @@ -1727,53 +1657,17 @@ func (dn *Daemon) checkStateOnFirstRun() error { // Update our cached copy dn.node = node - pendingState, err := dn.getPendingState() - if err != nil { - return err - } - var pendingConfigName, bootID string - if pendingState != nil { - pendingConfigName = pendingState.Message - bootID = pendingState.BootID - } - // XXX: drop this - // we need this compatibility layer for now - if pendingState == nil { - legacyPendingState, err := dn.getPendingConfig() - if err != nil { - return err - } - if legacyPendingState != nil { - pendingConfigName = legacyPendingState.PendingConfig - bootID = legacyPendingState.BootID - } - } - - state, err := dn.getStateAndConfigs(pendingConfigName) + state, err := dn.getStateAndConfigs() if err != nil { return err } - // if we have a pendingConfig but we're into the same bootid, we failed to drain or reboot - // and if we still have a pendingConfig it means we've been killed by kube after 600s - // take a stab at that and re-run the drain+reboot routine - if state.pendingConfig != nil && bootID == dn.bootID { - logSystem("drain interrupted, retrying") - if err := dn.performDrain(); err != nil { - return err - } - if err := dn.finalizeBeforeReboot(state.pendingConfig); err != nil { - return err - } - return dn.reboot(fmt.Sprintf("Node will reboot into config %v", state.pendingConfig.GetName())) - } - if err := dn.detectEarlySSHAccessesFromBoot(); err != nil { return fmt.Errorf("error detecting previous SSH accesses: %w", err) } if err := dn.removeRollback(); err != nil { - return fmt.Errorf("Failed to remove rollback: %w", err) + return fmt.Errorf("failed to remove rollback: %w", err) } // Bootstrapping state is when we have the node annotations file @@ -1786,7 +1680,7 @@ func (dn *Daemon) checkStateOnFirstRun() error { // Check to see if we have a layered/new format image isLayeredImage, err := dn.NodeUpdaterClient.IsBootableImage(targetOSImageURL) if err != nil { - return fmt.Errorf("Error checking type of target image: %w", err) + return fmt.Errorf("error checking type of target image: %w", err) } if isLayeredImage { @@ -1807,9 +1701,6 @@ func (dn *Daemon) checkStateOnFirstRun() error { return err } } - if err := dn.finalizeBeforeReboot(state.currentConfig); err != nil { - return err - } return dn.reboot(fmt.Sprintf("Node will reboot into config %v", state.currentConfig.GetName())) } logSystem("No bootstrap pivot required; unlinking bootstrap node annotations") @@ -1843,19 +1734,8 @@ func (dn *Daemon) checkStateOnFirstRun() error { // // In the case where we're booting a node for the first time, or the MCD // is restarted, that will be the current config. - // - // In the case where we have - // a pending config, this is where we validate that it actually applied. - // We currently just do this on startup, but in the future it could e.g. be - // a once-a-day or week cron job. - var expectedConfig *mcfgv1.MachineConfig - if state.pendingConfig != nil { - glog.Infof("Validating against pending config %s", state.pendingConfig.GetName()) - expectedConfig = state.pendingConfig - } else { - glog.Infof("Validating against current config %s", state.currentConfig.GetName()) - expectedConfig = state.currentConfig - } + + glog.Infof("Validating against current config %s", state.currentConfig.GetName()) if forceFileExists() { logSystem("Skipping on-disk validation; %s present", constants.MachineConfigDaemonForceFile) @@ -1865,12 +1745,12 @@ func (dn *Daemon) checkStateOnFirstRun() error { // When upgrading the OS, it is possible that the SSH key location will // change. We should detect whether that is the case and update before we // check for any config drift. - if err := dn.updateSSHKeyLocationIfNeeded(expectedConfig); err != nil { + if err := dn.updateSSHKeyLocationIfNeeded(state.currentConfig); err != nil { return err } - if err := dn.validateOnDiskState(expectedConfig); err != nil { - wErr := fmt.Errorf("unexpected on-disk state validating against %s: %w", expectedConfig.GetName(), err) + if err := dn.validateOnDiskState(state.currentConfig); err != nil { + wErr := fmt.Errorf("unexpected on-disk state validating against %s: %w", state.currentConfig.GetName(), err) dn.nodeWriter.Eventf(corev1.EventTypeWarning, "OnDiskStateValidationFailed", wErr.Error()) return wErr } @@ -1896,10 +1776,6 @@ func (dn *Daemon) checkStateOnFirstRun() error { // updateConfigAndState updates node to desired state, labels nodes as done and uncordon func (dn *Daemon) updateConfigAndState(state *stateAndConfigs) (bool, error) { - // In the case where we had a pendingConfig, make that now currentConfig. - if state.pendingConfig != nil { - state.currentConfig = state.pendingConfig - } if state.bootstrapping { if err := dn.storeCurrentConfigOnDisk(state.currentConfig); err != nil { @@ -1907,32 +1783,38 @@ func (dn *Daemon) updateConfigAndState(state *stateAndConfigs) (bool, error) { } } + // Set the current config to the last written config to disk. This will be the last + // "successful" config update we have completed. + currentOnDisk, err := dn.getCurrentConfigOnDisk() + if err == nil { + state.currentConfig = currentOnDisk + } else { + glog.Infof("Error reading config from disk") + return false, fmt.Errorf("error reading config from disk: %w", err) + } + // In case of node reboot, it may be the case that desiredConfig changed while we // were coming up, so we next look at that before uncordoning the node (so // we don't uncordon and then immediately re-cordon) + inDesiredConfig := state.currentConfig.GetName() == state.desiredConfig.GetName() if inDesiredConfig { - if state.pendingConfig != nil { - // Great, we've successfully rebooted for the desired config, - // let's mark it done! - glog.Infof("Completing pending config %s", state.pendingConfig.GetName()) - if err := dn.completeUpdate(state.pendingConfig.GetName()); err != nil { - UpdateStateMetric(mcdUpdateState, "", err.Error()) - return inDesiredConfig, err - } - - // We update the node annotation, delete the state file, etc. - if dn.nodeWriter != nil { - dn.nodeWriter.Eventf(corev1.EventTypeNormal, "NodeDone", fmt.Sprintf("Setting node %s, currentConfig %s to Done", dn.node.Name, state.pendingConfig.GetName())) - } - if err := dn.nodeWriter.SetDone(state.pendingConfig.GetName()); err != nil { - return true, fmt.Errorf("error setting node's state to Done: %w", err) - } - if out, err := dn.storePendingState(state.pendingConfig, 0); err != nil { - return true, fmt.Errorf("failed to reset pending config: %s: %w", string(out), err) - } + // Great, we've successfully rebooted for the desired config, + // let's mark it done! + glog.Infof("Completing update to target config %s", state.currentConfig.GetName()) + if err := dn.completeUpdate(state.currentConfig.GetName()); err != nil { + UpdateStateMetric(mcdUpdateState, "", err.Error()) + return inDesiredConfig, err + } + // We update the node annotation, and pop an event saying we're done. + if dn.nodeWriter != nil { + dn.nodeWriter.Eventf(corev1.EventTypeNormal, "NodeDone", fmt.Sprintf("Setting node %s, currentConfig %s to Done", dn.node.Name, state.currentConfig.GetName())) + } + if err := dn.nodeWriter.SetDone(state.currentConfig.GetName()); err != nil { + return true, fmt.Errorf("error setting node's state to Done: %w", err) } + // If we're degraded here, it means we got an error likely on startup and we retried. // If that's the case, clear it out. if state.state == constants.MachineConfigDaemonStateDegraded { @@ -2068,7 +1950,7 @@ func (dn *Daemon) prepUpdateFromCluster() (*mcfgv1.MachineConfig, *mcfgv1.Machin // been completed. func (dn *Daemon) completeUpdate(desiredConfigName string) error { if err := dn.nodeWriter.SetDesiredDrainer(fmt.Sprintf("%s-%s", "uncordon", desiredConfigName)); err != nil { - return fmt.Errorf("Could not set drain annotation: %w", err) + return fmt.Errorf("could not set drain annotation: %w", err) } if err := wait.Poll(10*time.Second, 10*time.Minute, func() (bool, error) { @@ -2087,7 +1969,7 @@ func (dn *Daemon) completeUpdate(desiredConfigName string) error { dn.nodeWriter.Eventf(corev1.EventTypeWarning, "FailedToUncordon", failMsg) return fmt.Errorf(failMsg) } - return fmt.Errorf("Something went wrong while attempting to uncordon node: %v", err) + return fmt.Errorf("something went wrong while attempting to uncordon node: %v", err) } logSystem("Update completed for config %s and node has been successfully uncordoned", desiredConfigName) @@ -2158,7 +2040,7 @@ func (dn *CoreOSDaemon) validateKernelArguments(currentConfig *mcfgv1.MachineCon } glog.Infof("Current ostree kargs: %s", rpmostreeKargs) glog.Infof("Expected MachineConfig kargs: %v", expected) - return fmt.Errorf("Missing expected kernel arguments: %v", missing) + return fmt.Errorf("missing expected kernel arguments: %v", missing) } return nil } @@ -2323,9 +2205,5 @@ func forceFileExists() bool { _, err := os.Stat(constants.MachineConfigDaemonForceFile) // No error means we could stat the file; it exists - if err == nil { - return true - } - - return false + return err == nil } diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go index 9518cb4464..f69b2bdca0 100644 --- a/pkg/daemon/update.go +++ b/pkg/daemon/update.go @@ -13,7 +13,6 @@ import ( "reflect" "strconv" "strings" - "syscall" "time" "github.com/clarketm/json" @@ -105,7 +104,7 @@ func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string // We are here, which means reboot was not needed to apply the configuration. // Get current state of node, in case of an error reboot - state, err := dn.getStateAndConfigs(configName) + state, err := dn.getStateAndConfigs() if err != nil { return fmt.Errorf("could not apply update: error processing state and configs. Error: %w", err) } @@ -124,31 +123,6 @@ func (dn *Daemon) performPostConfigChangeAction(postConfigChangeActions []string return dn.triggerUpdateWithMachineConfig(state.currentConfig, state.desiredConfig, true) } -// finalizeBeforeReboot is the last step in an update() and then we take appropriate postConfigChangeAction. -// It can also be called as a special case for the "bootstrap pivot". -func (dn *Daemon) finalizeBeforeReboot(newConfig *mcfgv1.MachineConfig) (retErr error) { - if out, err := dn.storePendingState(newConfig, 1); err != nil { - return fmt.Errorf("failed to log pending config: %s: %w", string(out), err) - } - defer func() { - if retErr != nil { - if dn.nodeWriter != nil { - dn.nodeWriter.Eventf(corev1.EventTypeNormal, "PendingConfigRollBack", fmt.Sprintf("Rolling back pending config %s: %v", newConfig.GetName(), retErr)) - } - if out, err := dn.storePendingState(newConfig, 0); err != nil { - errs := kubeErrs.NewAggregate([]error{err, retErr}) - retErr = fmt.Errorf("error rolling back pending config %s: %w", string(out), errs) - return - } - } - }() - if dn.nodeWriter != nil { - dn.nodeWriter.Eventf(corev1.EventTypeNormal, "PendingConfig", fmt.Sprintf("Written pending config %s", newConfig.GetName())) - } - - return nil -} - func canonicalizeEmptyMC(config *mcfgv1.MachineConfig) *mcfgv1.MachineConfig { if config != nil { return config @@ -360,7 +334,7 @@ func (dn *CoreOSDaemon) applyOSChanges(mcDiff machineConfigDiff, oldConfig, newC // The steps from here on are different depending on the image type, so check the image type isLayeredImage, err := dn.NodeUpdaterClient.IsBootableImage(newConfig.Spec.OSImageURL) if err != nil { - return fmt.Errorf("Error checking type of update image: %w", err) + return fmt.Errorf("error checking type of update image: %w", err) } // TODO(jkyros): we can remove the format check and simplify this once we retire the "old format" images @@ -616,10 +590,6 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi } }() - if err := dn.finalizeBeforeReboot(newConfig); err != nil { - return err - } - return dn.performPostConfigChangeAction(actions, newConfig.GetName()) } @@ -1189,7 +1159,7 @@ func (dn *CoreOSDaemon) switchKernel(oldConfig, newConfig *mcfgv1.MachineConfig) return runRpmOstree(args...) } - return fmt.Errorf("Unhandled kernel type %s", newKtype) + return fmt.Errorf("unhandled kernel type %s", newKtype) } // updateFiles writes files specified by the nodeconfig to disk. it also writes @@ -1622,7 +1592,7 @@ func (dn *Daemon) SetPasswordHash(newUsers []ign3types.PasswdUser) error { } if out, err := exec.Command("usermod", "-p", pwhash, u.Name).CombinedOutput(); err != nil { - return fmt.Errorf("Failed to change password for %s: %s:%w", u.Name, out, err) + return fmt.Errorf("failed to change password for %s: %s:%w", u.Name, out, err) } glog.Info("Password has been configured") } @@ -1892,117 +1862,6 @@ func (dn *Daemon) updateLayeredOS(config *mcfgv1.MachineConfig) error { return nil } -func (dn *Daemon) getPendingStateLegacyLogger() (*journalMsg, error) { - glog.Info("logger doesn't support --jounald, grepping the journal") - - cmdLiteral := "journalctl -o cat _UID=0 | grep -v audit | grep OPENSHIFT_MACHINE_CONFIG_DAEMON_LEGACY_LOG_HACK" - cmd := exec.Command("bash", "-c", cmdLiteral) - var combinedOutput bytes.Buffer - cmd.Stdout = &combinedOutput - cmd.Stderr = &combinedOutput - if err := cmd.Start(); err != nil { - return nil, fmt.Errorf("failed shelling out to journalctl -o cat: %w", err) - } - if err := cmd.Wait(); err != nil { - if exiterr, ok := err.(*exec.ExitError); ok { - // The program has exited with an exit code != 0 - status, ok := exiterr.Sys().(syscall.WaitStatus) - if ok { - // grep exit with 1 if it doesn't find anything - // from man: Normally, the exit status is 0 if selected lines are found and 1 otherwise. But the exit status is 2 if an error occurred - if status.ExitStatus() == 1 { - return nil, nil - } - if status.ExitStatus() > 1 { - errs := kubeErrs.NewAggregate([]error{exiterr, fmt.Errorf("grep exited with %s", combinedOutput.Bytes())}) - return nil, fmt.Errorf("failed to grep on journal output: %w", errs) - } - } - } else { - return nil, fmt.Errorf("command wait error: %w", err) - } - } - journalOutput := combinedOutput.Bytes() - // just an extra safety check? - if len(journalOutput) == 0 { - return nil, nil - } - return dn.processJournalOutput(journalOutput) -} - -type journalMsg struct { - Message string `json:"MESSAGE,omitempty"` - BootID string `json:"BOOT_ID,omitempty"` - Pending string `json:"PENDING,omitempty"` - OldLogger string `json:"OPENSHIFT_MACHINE_CONFIG_DAEMON_LEGACY_LOG_HACK,omitempty"` // unused today -} - -func (dn *Daemon) processJournalOutput(journalOutput []byte) (*journalMsg, error) { - lines := strings.Split(strings.TrimSpace(string(journalOutput)), "\n") - last := lines[len(lines)-1] - - entry := &journalMsg{} - if err := json.Unmarshal([]byte(last), entry); err != nil { - return nil, fmt.Errorf("getting pending state from journal: %w", err) - } - if entry.Pending == "0" { - return nil, nil - } - return entry, nil -} - -// getPendingState loads the JSON state we cache across attempting to apply -// a config+reboot. If no pending state is available, ("", nil) will be returned. -// The bootID is stored in the pending state; if it is unchanged, we assume -// that we failed to reboot; that for now should be a fatal error, in order to avoid -// reboot loops. -func (dn *Daemon) getPendingState() (*journalMsg, error) { - if !dn.loggerSupportsJournal { - return dn.getPendingStateLegacyLogger() - } - journalOutput, err := exec.Command("journalctl", "-o", "json", "_UID=0", fmt.Sprintf("MESSAGE_ID=%s", pendingStateMessageID)).CombinedOutput() - if err != nil { - return nil, fmt.Errorf("error running journalctl -o json: %w", err) - } - if len(journalOutput) == 0 { - return nil, nil - } - return dn.processJournalOutput(journalOutput) -} - -func (dn *Daemon) storePendingStateLegacyLogger(pending *mcfgv1.MachineConfig, isPending int) ([]byte, error) { - glog.Info("logger doesn't support --jounald, logging json directly") - - if isPending == 1 { - if err := dn.writePendingConfig(pending); err != nil { - return nil, err - } - } else { - if err := os.Remove(pendingConfigPath); err != nil { - return nil, err - } - } - - oldLogger := exec.Command("logger", fmt.Sprintf(`{"MESSAGE": "%s", "BOOT_ID": "%s", "PENDING": "%d", "OPENSHIFT_MACHINE_CONFIG_DAEMON_LEGACY_LOG_HACK": "1"}`, pending.GetName(), dn.bootID, isPending)) - return oldLogger.CombinedOutput() -} - -func (dn *Daemon) storePendingState(pending *mcfgv1.MachineConfig, isPending int) ([]byte, error) { - if !dn.loggerSupportsJournal { - return dn.storePendingStateLegacyLogger(pending, isPending) - } - logger := exec.Command("logger", "--journald") - - var pendingState bytes.Buffer - pendingState.WriteString(fmt.Sprintf(`MESSAGE_ID=%s -MESSAGE=%s -BOOT_ID=%s -PENDING=%d`, pendingStateMessageID, pending.GetName(), dn.bootID, isPending)) - - logger.Stdin = &pendingState - return logger.CombinedOutput() -} - // Synchronously invoke a command, writing its stdout to our stdout, // and gathering stderr into a buffer which will be returned in err // in case of error.