From b0af842b3407f4b3d398f80a205b383703121a4e Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 5 May 2021 11:49:47 -0700 Subject: [PATCH 1/5] ci/gha: only leave CentOS 7 Signed-off-by: Kir Kolyshkin --- .github/workflows/test.yml | 133 ------------------------------------- 1 file changed, 133 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f125887cdc8..c29a5687782 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,114 +14,11 @@ on: pull_request: jobs: - test: - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - # Dockre/Moby still builds runc with Go 1.13, so we should still support Go 1.13. - go-version: [1.13.x, 1.15.x, 1.16.x] - rootless: ["rootless", ""] - race: ["-race", ""] - - steps: - - - name: checkout - uses: actions/checkout@v2 - - - name: install deps - run: | - # criu repo - sudo add-apt-repository -y ppa:criu/ppa - # apt-add-repository runs apt update so we don't have to - sudo apt -q install libseccomp-dev criu - - - name: install go ${{ matrix.go-version }} - uses: actions/setup-go@v2 - with: - stable: '!contains(${{ matrix.go-version }}, "beta") && !contains(${{ matrix.go-version }}, "rc")' - go-version: ${{ matrix.go-version }} - - - name: build - run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all - - - name: install bats - uses: mig4/setup-bats@v1 - with: - bats-version: 1.2.1 - - - name: unit test - if: matrix.rootless != 'rootless' - run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest - - - name: add rootless user - if: matrix.rootless == 'rootless' - run: | - sudo useradd -u2000 -m -d/home/rootless -s/bin/bash rootless - # Allow root to execute `ssh rootless@localhost` in tests/rootless.sh - ssh-keygen -t ecdsa -N "" -f $HOME/rootless.key - sudo mkdir -m 0700 -p /home/rootless/.ssh - sudo cp $HOME/rootless.key.pub /home/rootless/.ssh/authorized_keys - sudo chown -R rootless.rootless /home/rootless - - - name: integration test (fs driver) - run: sudo -E PATH="$PATH" script -e -c 'make local${{ matrix.rootless }}integration' - - - name: integration test (systemd driver) - # can't use systemd driver with cgroupv1 - if: matrix.rootless != 'rootless' - run: sudo -E PATH="$PATH" script -e -c 'make RUNC_USE_SYSTEMD=yes local${{ matrix.rootless }}integration' - - - # cgroup v2 unified hierarchy + very recent kernel (openat2) - fedora: - # nested virtualization is only available on macOS hosts - runs-on: macos-10.15 - timeout-minutes: 30 - # only run it if others have passed - needs: [test] - steps: - - uses: actions/checkout@v2 - - - name: "Cache ~/.vagrant.d/boxes, using hash of Vagrantfile.fedora34" - uses: actions/cache@v2 - with: - path: ~/.vagrant.d/boxes - key: vagrant-${{ hashFiles('Vagrantfile.fedora34') }} - - - name: prepare vagrant - run: | - ln -sf Vagrantfile.fedora34 Vagrantfile - # Retry if it fails (download.fedoraproject.org returns 404 sometimes) - vagrant up || vagrant up - vagrant ssh-config >> ~/.ssh/config - - - name: system info - run: ssh default 'sh -exc "uname -a && systemctl --version && df -T"' - - - name: unit tests - run: ssh default 'cd /vagrant && sudo make localunittest' - - - name: cgroupv2 with systemd - run: ssh -tt default "sudo make -C /vagrant localintegration RUNC_USE_SYSTEMD=yes" - - - name: cgroupv2 with fs2 - run: ssh -tt default "sudo make -C /vagrant localintegration" - - - name: cgroupv2 with systemd (rootless) - run: ssh -tt default "sudo make -C /vagrant localrootlessintegration RUNC_USE_SYSTEMD=yes" - - - name: cgroupv2 with fs2 (rootless) - run: ssh -tt default "sudo make -C /vagrant localrootlessintegration" - - # kernel 3.10 (frankenized), systemd 219 centos7: # nested virtualization is only available on macOS hosts runs-on: macos-10.15 timeout-minutes: 15 - # only run it if others have passed - needs: [test] steps: - uses: actions/checkout@v2 @@ -153,33 +50,3 @@ jobs: # FIXME: rootless is skipped because of EPERM on writing cgroup.procs if: false run: ssh default "sudo -i make -C /vagrant localrootlessintegration" - - # We need to continue support for 32-bit ARM. - # However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff. - # We are not interested in providing official support for i386. - cross-i386: - runs-on: ubuntu-20.04 - - steps: - - - name: checkout - uses: actions/checkout@v2 - - - name: install deps - run: | - sudo dpkg --add-architecture i386 - # add criu repo - sudo add-apt-repository -y ppa:criu/ppa - # apt-add-repository runs apt update so we don't have to. - - # Due to a bug in apt, we have to update it first - # (see https://bugs.launchpad.net/ubuntu-cdimage/+bug/1871268) - sudo apt -q install apt - sudo apt -q install libseccomp-dev libseccomp-dev:i386 gcc-multilib criu - - - name: install go - uses: actions/setup-go@v2 # use default Go version - - - name: unit test - # cgo is disabled by default when cross-compiling - run: sudo -E PATH="$PATH" -- make GOARCH=386 CGO_ENABLED=1 localunittest From f55015092b3af8bb74d051c8be7271bd4b66b9a2 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 5 May 2021 13:08:49 -0700 Subject: [PATCH 2/5] libct/intelrdt: fix unit test 1. These tests can't be run in parallel since they do check a global variable (mbaScEnabled). 2. findIntelRdtMountpointDir() relies on mbaScEnabled to be initially set to the default value (false) and this the test fails if run more than once: > go test -count 2 > ... > intelrdt_test.go:243: expected mbaScEnabled=false, got true > --- FAIL: TestFindIntelRdtMountpointDir/Valid_mountinfo_with_MBA_Software_Controller_disabled (0.00s) Fixes: 2c70d2384 Signed-off-by: Kir Kolyshkin --- libcontainer/intelrdt/intelrdt_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcontainer/intelrdt/intelrdt_test.go b/libcontainer/intelrdt/intelrdt_test.go index 485442d723d..ef7abd33f5c 100644 --- a/libcontainer/intelrdt/intelrdt_test.go +++ b/libcontainer/intelrdt/intelrdt_test.go @@ -217,10 +217,10 @@ func TestFindIntelRdtMountpointDir(t *testing.T) { }, } - t.Parallel() for _, tc := range testCases { tc := tc t.Run(tc.name, func(t *testing.T) { + mbaScEnabled = false mp, err := findIntelRdtMountpointDir(tc.input) if tc.isNotFoundError { if !IsNotFound(err) { From 6e07505dff6b33b081be9305de349a86b3d95498 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 5 May 2021 15:12:31 -0700 Subject: [PATCH 3/5] freezer: log success with Info level Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/fs/freezer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcontainer/cgroups/fs/freezer.go b/libcontainer/cgroups/fs/freezer.go index 37bbf38ff40..cf46cf2e9d0 100644 --- a/libcontainer/cgroups/fs/freezer.go +++ b/libcontainer/cgroups/fs/freezer.go @@ -75,7 +75,7 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) { continue case string(configs.Frozen): if i > 1 { - logrus.Debugf("frozen after %d retries", i) + logrus.Infof("frozen after %d retries", i) } return nil default: From 8b29a73a7333d4247f814bbcdc547a2c2e7dfe29 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 5 May 2021 15:37:50 -0700 Subject: [PATCH 4/5] localunittest: only run Freeze 1000 times 500x each test (with and without systemd). Signed-off-by: Kir Kolyshkin --- .github/workflows/test.yml | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c29a5687782..42e3b45f5e5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: centos7: # nested virtualization is only available on macOS hosts runs-on: macos-10.15 - timeout-minutes: 15 + timeout-minutes: 120 steps: - uses: actions/checkout@v2 diff --git a/Makefile b/Makefile index aff6d531c47..67f4179cebc 100644 --- a/Makefile +++ b/Makefile @@ -74,7 +74,7 @@ unittest: runcimage $(RUNC_IMAGE) make localunittest TESTFLAGS=$(TESTFLAGS) localunittest: all - $(GO) test $(MOD_VENDOR) -timeout 3m -tags "$(BUILDTAGS)" $(TESTFLAGS) -v ./... + $(GO) test $(MOD_VENDOR) -timeout 1h -count 500 -tags "$(BUILDTAGS)" $(TESTFLAGS) -v -run 'TestFreeze|TestSystemdFreeze' ./libcontainer/integration integration: runcimage $(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \ From 3573c5cf5591b7c8461cfce900b501926f65e330 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 5 May 2021 12:55:56 -0700 Subject: [PATCH 5/5] freezer: add delay after freeze MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I hate to keep adding those kludges, but lately TestFreeze (and TestSystemdFreeze) from libcontainer/integration fails a lot. The failure comes and goes, and is probably this is caused by a slow host allocated for the test, and a slow VM on top of it. To remediate, add a small sleep on every 25th iteration in between asking the kernel to freeze and checking its status. In the worst case scenario (failure to freeze) this adds 0.4 μs to the duration of the call (nothing compared to that sleep after the temporary thaw). It is hard to measure how this affects CI but (with added debug prints) on a histogram of number of retries I saw peaks at and after numbers 25, 50, 75 etc. meaning this works. Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/fs/freezer.go | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/libcontainer/cgroups/fs/freezer.go b/libcontainer/cgroups/fs/freezer.go index cf46cf2e9d0..e972e8c1e3c 100644 --- a/libcontainer/cgroups/fs/freezer.go +++ b/libcontainer/cgroups/fs/freezer.go @@ -48,15 +48,16 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) { // (either via fork/clone or by writing new PIDs to // cgroup.procs). // - // The numbers below are chosen to have a decent chance to - // succeed even in the worst case scenario (runc pause/unpause - // with parallel runc exec). + // The numbers below are empirically chosen to have a decent + // chance to succeed in various scenarios (such as "very slow + // VM" and "runc pause/unpause with parallel runc exec"), + // tested on RHEL7 kernel. // - // Adding any amount of sleep in between retries did not - // increase the chances of successful freeze. + // Alas, this is still a game of chances. for i := 0; i < 1000; i++ { if i%50 == 49 { - // Briefly thawing the cgroup also helps. + // Occasional thaw and sleep improves + // the chances to succeed in freezing. _ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed)) time.Sleep(10 * time.Millisecond) } @@ -65,6 +66,12 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) { return err } + if i%25 == 24 { + // Occasional short sleep before reading + // the state back also improves the chances + // to succeed in freezing. + time.Sleep(10 * time.Microsecond) + } state, err := fscommon.ReadFile(path, "freezer.state") if err != nil { return err @@ -74,8 +81,8 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) { case "FREEZING": continue case string(configs.Frozen): - if i > 1 { - logrus.Infof("frozen after %d retries", i) + if i > 0 { + logrus.Infof("frozen after %d attempts", i+1) } return nil default: