From c4dc52786e9cc8d4aa11ce4174706eb14247ca7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Bombo?=
Date: Thu, 20 Feb 2025 23:31:57 -0600
Subject: [PATCH 1/3] agent: Fix race condition with cgroup watchers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the CI, test containers intermittently fail to start after creation,
with an error like below (see #10872 for more details):

  # State: Terminated
  # Reason: StartError
  # Message: failed to start containerd task "afd43e77fae0815afbc7205eac78f94859e247968a6a4e8bcbb987690fcf10a6": No such file or directory (os error 2)

I've observed this error to repro with the following containers, which
have in common that they're all *very short-lived* by design (more
tests might be affected):

 * k8s-job.bats
 * k8s-seccomp.bats
 * k8s-hostname.bats
 * k8s-policy-job.bats
 * k8s-policy-logs.bats

Furthermore, appending a `; sleep 1` to the command line for those
containers seemed to consistently get rid of the error.

Investigating further, I've uncovered a race between the end of the
container process and the setting up of the cgroup watchers (to report
OOMs). If the process terminates first, the agent will try to watch
cgroup paths that don't exist anymore, and it will fail to start the
container.

The added error context in notifier.rs confirms that the error comes
from the missing cgroup:

https://github.com/kata-containers/kata-containers/actions/runs/13450787436/job/37585901466#step:17:6536

The fix simply consists in creating the watchers *before* we start the
container but still *after* we create it -- this is non-blocking, and
IIUC the cgroup is guaranteed to already be present then.

Fixes: #10872

Signed-off-by: Aurélien Bombo
---
 src/agent/rustjail/src/cgroups/notifier.rs | 12 ++++++++++--
 src/agent/src/rpc.rs                       | 21 +++++++++++----------
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/agent/rustjail/src/cgroups/notifier.rs b/src/agent/rustjail/src/cgroups/notifier.rs
index 5260a3d3f29b..9b74967811ff 100644
--- a/src/agent/rustjail/src/cgroups/notifier.rs
+++ b/src/agent/rustjail/src/cgroups/notifier.rs
@@ -76,9 +76,17 @@ async fn register_memory_event_v2(
     let mut inotify = Inotify::init().context("Failed to initialize inotify")?;
 
     // watching oom kill
-    let ev_wd = inotify.add_watch(&event_control_path, WatchMask::MODIFY)?;
+    let ev_wd = inotify
+        .add_watch(&event_control_path, WatchMask::MODIFY)
+        .context(format!("failed to add watch for {:?}", &event_control_path))?;
+
     // Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited
-    let cg_wd = inotify.add_watch(&cgroup_event_control_path, WatchMask::MODIFY)?;
+    let cg_wd = inotify
+        .add_watch(&cgroup_event_control_path, WatchMask::MODIFY)
+        .context(format!(
+            "failed to add watch for {:?}",
+            &cgroup_event_control_path
+        ))?;
 
     info!(sl(), "ev_wd: {:?}", ev_wd);
     info!(sl(), "cg_wd: {:?}", cg_wd);
diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs
index e19933882488..63966e266d24 100644
--- a/src/agent/src/rpc.rs
+++ b/src/agent/src/rpc.rs
@@ -293,24 +293,25 @@ impl AgentService {
     async fn do_start_container(&self, req: protocols::agent::StartContainerRequest) -> Result<()> {
         let mut s = self.sandbox.lock().await;
         let sid = s.id.clone();
-        let cid = req.container_id;
+        let cid = req.container_id.clone();
 
         let ctr = s
             .get_container(&cid)
             .ok_or_else(|| anyhow!("Invalid container id"))?;
-        ctr.exec().await?;
 
-        if sid == cid {
-            return Ok(());
+        if sid != cid {
+            // start oom event loop
+            if let Ok(cg_path) = ctr.cgroup_manager.as_ref().get_cgroup_path("memory") {
+                let rx = notifier::notify_oom(cid.as_str(), cg_path.to_string()).await?;
+                s.run_oom_event_monitor(rx, cid.clone()).await;
+            }
         }
 
-        // start oom event loop
-        if let Ok(cg_path) = ctr.cgroup_manager.as_ref().get_cgroup_path("memory") {
-            let rx = notifier::notify_oom(cid.as_str(), cg_path.to_string()).await?;
-            s.run_oom_event_monitor(rx, cid).await;
-        }
+        let ctr = s
+            .get_container(&cid)
+            .ok_or_else(|| anyhow!("Invalid container id"))?;
 
-        Ok(())
+        ctr.exec().await
     }
 
     #[instrument]

From cb4236c0cb63ea98e036a1e663c21af9c8e38b23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Bombo?=
Date: Fri, 21 Feb 2025 11:41:07 -0600
Subject: [PATCH 2/3] runtime: cgroups: Remove commented out code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Doesn't seem like we're going to use this and it's confusing when
inspecting code.

Signed-off-by: Aurélien Bombo
---
 src/runtime/virtcontainers/kata_agent.go | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go
index 9439afca1bdc..cb9b9e028c9c 100644
--- a/src/runtime/virtcontainers/kata_agent.go
+++ b/src/runtime/virtcontainers/kata_agent.go
@@ -1001,20 +1001,6 @@ func (k *kataAgent) constrainGRPCSpec(grpcSpec *grpc.Spec, passSeccomp bool, dis
 	grpcSpec.Linux.Resources.CPU.Mems = ""
 	}
 
-	// We need agent systemd cgroup now.
-	// There are three main reasons to do not apply systemd cgroups in the VM
-	//   - Initrd image doesn't have systemd.
-	//   - Nobody will be able to modify the resources of a specific container by using systemctl set-property.
-	//   - docker is not running in the VM.
-	// if resCtrl.IsSystemdCgroup(grpcSpec.Linux.CgroupsPath) {
-	// 	// Convert systemd cgroup to cgroupfs
-	// 	slice := strings.Split(grpcSpec.Linux.CgroupsPath, ":")
-	// 	// 0 - slice: system.slice
-	// 	// 1 - prefix: docker
-	// 	// 2 - name: abc123
-	// 	grpcSpec.Linux.CgroupsPath = filepath.Join("/", slice[1], slice[2])
-	// }
-
 	// Disable network namespace since it is already handled on the host by
 	// virtcontainers. The network is a complex part which cannot be simply
 	// passed to the agent.

From 3c4535a176c6f8c71f47a0ca23bf285f945abbbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Bombo?=
Date: Fri, 21 Feb 2025 08:28:59 -0600
Subject: [PATCH 3/3] ci: Fix GH throttling in run-nerdctl-tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Specify a GH API token to avoid the below throttling error:

https://github.com/kata-containers/kata-containers/actions/runs/13450787436/job/37585810679?pr=10911#step:4:96

Signed-off-by: Aurélien Bombo
---
 .github/workflows/basic-ci-amd64.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/basic-ci-amd64.yaml b/.github/workflows/basic-ci-amd64.yaml
index 714f27f7f0bc..c3202b7c9e3f 100644
--- a/.github/workflows/basic-ci-amd64.yaml
+++ b/.github/workflows/basic-ci-amd64.yaml
@@ -304,6 +304,8 @@ jobs:
           TARGET_BRANCH: ${{ inputs.target-branch }}
 
       - name: Install dependencies
+        env:
+          GITHUB_API_TOKEN: ${{ github.token }}
         run: bash tests/integration/nerdctl/gha-run.sh install-dependencies
 
       - name: get-kata-tarball