From 1f870f7964ed57fc2c0168ed3402cd8caa427623 Mon Sep 17 00:00:00 2001 From: Zhang Tianyang Date: Fri, 5 Apr 2024 16:23:16 +0800 Subject: [PATCH 1/2] bugfix: residual sync clock thread Signed-off-by: Zhang Tianyang --- vmm/sandbox/src/client.rs | 138 +++++++++++++++++++++++++------------ vmm/sandbox/src/sandbox.rs | 2 +- 2 files changed, 94 insertions(+), 46 deletions(-) diff --git a/vmm/sandbox/src/client.rs b/vmm/sandbox/src/client.rs index 3a42f8d3..e6c45f44 100644 --- a/vmm/sandbox/src/client.rs +++ b/vmm/sandbox/src/client.rs @@ -15,13 +15,17 @@ limitations under the License. */ use std::{ - os::unix::io::{IntoRawFd, RawFd}, + os::fd::{IntoRawFd, RawFd}, + sync::Arc, time::Duration, }; use anyhow::anyhow; -use containerd_sandbox::error::{Error, Result}; -use log::{debug, error, warn}; +use containerd_sandbox::{ + error::{Error, Result}, + signal::ExitSignal, +}; +use log::{debug, error}; use nix::{ sys::{ socket::{connect, socket, AddressFamily, SockFlag, SockType, UnixAddr, VsockAddr}, @@ -258,59 +262,91 @@ pub(crate) async fn client_update_routes( Ok(()) } -pub(crate) async fn client_sync_clock(client: &SandboxServiceClient, id: &str) { +pub(crate) fn client_sync_clock( + client: &SandboxServiceClient, + id: &str, + exit_signal: Arc, +) { let id = id.to_string(); let client = client.clone(); - let tolerance_nanos = Duration::from_millis(TIME_DIFF_TOLERANCE_IN_MS).as_nanos() as i64; - let clock_id = ClockId::from_raw(nix::libc::CLOCK_REALTIME); + let tolerance_nanos = Duration::from_millis(TIME_DIFF_TOLERANCE_IN_MS); tokio::spawn(async move { - loop { - tokio::time::sleep(Duration::from_secs(TIME_SYNC_PERIOD)).await; - debug!("sync_clock {}: start sync clock from host to guest", id); - - let mut req = SyncClockPacket::new(); - match clock_gettime(clock_id) { - Ok(ts) => req.ClientSendTime = ts.num_nanoseconds(), - Err(e) => { - warn!("sync_clock {}: failed to get current clock: {}", id, e); - continue; - } - } - match client - .sync_clock(with_timeout(Duration::from_secs(1).as_nanos() as i64), &req) - .await - { - Ok(mut p) => { - match clock_gettime(clock_id) { - Ok(ts) => p.ServerArriveTime = ts.num_nanoseconds(), - Err(e) => { - warn!("sync_clock {}: failed to get current clock: {}", id, e); - continue; - } - } - p.Delta = ((p.ClientSendTime - p.ClientArriveTime) - + (p.ServerArriveTime - p.ServerSendTime)) - / 2; - if p.Delta.abs() > tolerance_nanos { - if let Err(e) = client - .sync_clock(with_timeout(Duration::from_secs(1).as_nanos() as i64), &p) - .await - { - error!("sync_clock {}: sync clock set delta failed: {:?}", id, e); - } - } - } - Err(e) => { - error!("sync_clock {}: get error: {:?}", id, e); + let fut = async { + loop { + tokio::time::sleep(Duration::from_secs(TIME_SYNC_PERIOD)).await; + if let Err(e) = do_once_sync_clock(&client, tolerance_nanos).await { + debug!("sync_clock {}: {:?}", id, e); } } + }; + + tokio::select! { + _ = fut => (), + _ = exit_signal.wait() => {}, } }); } +// Introduce a set of mechanism based on Precision Time Protocol to keep guest clock synchronized +// with host clock periodically. +async fn do_once_sync_clock( + client: &SandboxServiceClient, + tolerance_nanos: Duration, +) -> Result<()> { + let mut req = SyncClockPacket::new(); + let clock_id = ClockId::from_raw(nix::libc::CLOCK_REALTIME); + req.ClientSendTime = clock_gettime(clock_id) + .map_err(|e| anyhow!("get current clock: {}", e))? + .num_nanoseconds(); + + let mut p = client + .sync_clock(with_timeout(Duration::from_secs(1).as_nanos() as i64), &req) + .await + .map_err(|e| anyhow!("get guest clock packet: {:?}", e))?; + + p.ServerArriveTime = clock_gettime(clock_id) + .map_err(|e| anyhow!("get current clock: {}", e))? + .num_nanoseconds(); + + p.Delta = checked_compute_delta( + p.ClientSendTime, + p.ClientArriveTime, + p.ServerSendTime, + p.ServerArriveTime, + )?; + if p.Delta.abs() > tolerance_nanos.as_nanos() as i64 { + client + .sync_clock(with_timeout(Duration::from_secs(1).as_nanos() as i64), &p) + .await + .map_err(|e| anyhow!("set delta: {:?}", e))?; + } + Ok(()) +} + +// delta = ((c_send - c_arrive) + (s_arrive - s_send)) / 2 +fn checked_compute_delta(c_send: i64, c_arrive: i64, s_send: i64, s_arrive: i64) -> Result { + let delta_client = c_send + .checked_sub(c_arrive) + .ok_or_else(|| anyhow!("integer overflow {} - {}", c_send, c_arrive))?; + + let delta_server = s_arrive + .checked_sub(s_send) + .ok_or_else(|| anyhow!("integer overflow {} - {}", s_arrive, s_send))?; + + let delta_sum = delta_client + .checked_add(delta_server) + .ok_or_else(|| anyhow!("integer overflow {} + {}", delta_client, delta_server))?; + + let delta = delta_sum + .checked_div(2) + .ok_or_else(|| anyhow!("integer overflow {} / 2", delta_sum))?; + + Ok(delta) +} + #[cfg(test)] mod tests { - use crate::client::new_ttrpc_client_with_timeout; + use crate::client::{checked_compute_delta, new_ttrpc_client_with_timeout}; #[tokio::test] async fn test_new_ttrpc_client_timeout() { @@ -319,4 +355,16 @@ mod tests { .await .is_err()); } + + #[test] + fn test_checked_compute_delta() { + let c_send = 231; + let c_arrive = 135; + let s_send = 137; + let s_arrive = 298; + + let expect_delta = 128; + let actual_delta = checked_compute_delta(c_send, c_arrive, s_send, s_arrive).unwrap(); + assert_eq!(expect_delta, actual_delta); + } } diff --git a/vmm/sandbox/src/sandbox.rs b/vmm/sandbox/src/sandbox.rs index 0105b098..4808e249 100644 --- a/vmm/sandbox/src/sandbox.rs +++ b/vmm/sandbox/src/sandbox.rs @@ -535,7 +535,7 @@ where pub(crate) async fn sync_clock(&self) { let client_guard = self.client.lock().await; if let Some(client) = &*client_guard { - client_sync_clock(client, self.id.as_str()).await; + client_sync_clock(client, self.id.as_str(), self.exit_signal.clone()); } } From caeb01c48e3200254a2f26506a4cc4059cf62130 Mon Sep 17 00:00:00 2001 From: Zhang Tianyang Date: Fri, 5 Apr 2024 16:25:09 +0800 Subject: [PATCH 2/2] bugfix: residual fds of netlink socket and tun Signed-off-by: Zhang Tianyang --- vmm/sandbox/src/cloud_hypervisor/mod.rs | 31 ++++++------ vmm/sandbox/src/device.rs | 20 +++----- vmm/sandbox/src/network/link.rs | 7 +-- vmm/sandbox/src/qemu/devices/vsock.rs | 8 +-- vmm/sandbox/src/qemu/mod.rs | 29 ++++++----- vmm/sandbox/src/stratovirt/devices/vsock.rs | 8 +-- vmm/sandbox/src/stratovirt/mod.rs | 27 ++++++----- vmm/sandbox/src/utils.rs | 54 ++++++++++++++------- 8 files changed, 102 insertions(+), 82 deletions(-) diff --git a/vmm/sandbox/src/cloud_hypervisor/mod.rs b/vmm/sandbox/src/cloud_hypervisor/mod.rs index 8eeaef27..ac422d79 100644 --- a/vmm/sandbox/src/cloud_hypervisor/mod.rs +++ b/vmm/sandbox/src/cloud_hypervisor/mod.rs @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -use std::{os::unix::io::RawFd, process::Stdio, time::Duration}; +use std::{os::fd::OwnedFd, process::Stdio, time::Duration}; use anyhow::anyhow; use async_trait::async_trait; @@ -72,7 +72,8 @@ pub struct CloudHypervisorVM { wait_chan: Option>, #[serde(skip)] client: Option, - fds: Vec, + #[serde(skip)] + fds: Vec, pids: Pids, } @@ -142,7 +143,7 @@ impl CloudHypervisorVM { Ok(pid) } - fn append_fd(&mut self, fd: RawFd) -> usize { + fn append_fd(&mut self, fd: OwnedFd) -> usize { self.fds.push(fd); self.fds.len() - 1 + 3 } @@ -175,17 +176,19 @@ impl VM for CloudHypervisorVM { params.push("-vv".to_string()); } - let mut cmd = tokio::process::Command::new(&self.config.path); - cmd.args(params.as_slice()); - - set_cmd_fd(&mut cmd, self.fds.to_vec())?; - set_cmd_netns(&mut cmd, self.netns.to_string())?; - cmd.stdout(Stdio::piped()); - cmd.stderr(Stdio::piped()); - info!("start cloud hypervisor with cmdline: {:?}", cmd); - let child = cmd - .spawn() - .map_err(|e| anyhow!("failed to spawn cloud hypervisor command: {}", e))?; + // Drop cmd immediately to let the fds in pre_exec be closed. + let child = { + let mut cmd = tokio::process::Command::new(&self.config.path); + cmd.args(params.as_slice()); + + set_cmd_fd(&mut cmd, self.fds.drain(..).collect())?; + set_cmd_netns(&mut cmd, self.netns.to_string())?; + cmd.stdout(Stdio::piped()); + cmd.stderr(Stdio::piped()); + info!("start cloud hypervisor with cmdline: {:?}", cmd); + cmd.spawn() + .map_err(|e| anyhow!("failed to spawn cloud hypervisor command: {}", e))? + }; let pid = child.id(); self.pids.vmm_pid = pid; let pid_file = format!("{}/pid", self.base_dir); diff --git a/vmm/sandbox/src/device.rs b/vmm/sandbox/src/device.rs index 09d39e8f..38fd8a3e 100644 --- a/vmm/sandbox/src/device.rs +++ b/vmm/sandbox/src/device.rs @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -use std::os::unix::io::RawFd; +use std::os::fd::OwnedFd; use containerd_sandbox::error::{Error, Result}; @@ -182,41 +182,38 @@ impl Transport { } } -#[derive(Debug, Clone)] +#[derive(Debug)] pub enum DeviceInfo { Block(BlockDeviceInfo), - #[allow(dead_code)] Tap(TapDeviceInfo), - #[allow(dead_code)] Physical(PhysicalDeviceInfo), - #[allow(dead_code)] VhostUser(VhostUserDeviceInfo), Char(CharDeviceInfo), } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct BlockDeviceInfo { pub id: String, pub path: String, pub read_only: bool, } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct TapDeviceInfo { pub id: String, pub index: u32, pub name: String, pub mac_address: String, - pub fds: Vec, + pub fds: Vec, } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct PhysicalDeviceInfo { pub id: String, pub bdf: String, } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct VhostUserDeviceInfo { pub id: String, pub socket_path: String, @@ -224,7 +221,7 @@ pub struct VhostUserDeviceInfo { pub r#type: String, } -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct CharDeviceInfo { pub id: String, pub chardev_id: String, @@ -235,6 +232,5 @@ pub struct CharDeviceInfo { #[derive(Debug, Clone, Eq, PartialEq)] pub enum CharBackendType { Pipe(String), - #[allow(dead_code)] Socket(String), } diff --git a/vmm/sandbox/src/network/link.rs b/vmm/sandbox/src/network/link.rs index 5ddc2785..c33cdad2 100644 --- a/vmm/sandbox/src/network/link.rs +++ b/vmm/sandbox/src/network/link.rs @@ -324,11 +324,11 @@ impl NetworkInterface { Ok(()) } - pub async fn attach_to(&self, sandbox: &mut KuasarSandbox) -> Result<()> { + pub async fn attach_to(&mut self, sandbox: &mut KuasarSandbox) -> Result<()> { let id = format!("intf-{}", self.index); match &self.r#type { LinkType::Veth => { - if let Some(intf) = &self.twin { + if let Some(intf) = self.twin.as_mut() { sandbox .vm .attach(DeviceInfo::Tap(TapDeviceInfo { @@ -336,7 +336,7 @@ impl NetworkInterface { index: self.index, name: intf.name.to_string(), mac_address: self.mac_address.to_string(), - fds: intf.fds.iter().map(|fd| fd.as_raw_fd()).collect(), + fds: intf.fds.drain(..).collect(), })) .await?; } else { @@ -470,6 +470,7 @@ fn get_bdf_for_eth(if_name: &str) -> Result { e ) })?; + close(sock).unwrap_or_default(); Ok(bdf.to_string()) } diff --git a/vmm/sandbox/src/qemu/devices/vsock.rs b/vmm/sandbox/src/qemu/devices/vsock.rs index 7833dedf..6bc13f11 100644 --- a/vmm/sandbox/src/qemu/devices/vsock.rs +++ b/vmm/sandbox/src/qemu/devices/vsock.rs @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -use std::os::unix::io::{IntoRawFd, RawFd}; +use std::os::fd::{AsRawFd, OwnedFd}; use anyhow::anyhow; use containerd_sandbox::error::Result; @@ -60,7 +60,7 @@ impl VSockDevice { } } -pub async fn find_context_id() -> Result<(RawFd, u64)> { +pub async fn find_context_id() -> Result<(OwnedFd, u64)> { // TODO make sure if this thread_rng is enough, if we should new a seedable rng everytime. let vsock_file = tokio::fs::OpenOptions::new() .read(true) @@ -68,10 +68,10 @@ pub async fn find_context_id() -> Result<(RawFd, u64)> { .mode(0o666) .open(VHOST_VSOCK_DEV_PATH) .await?; - let vsockfd = vsock_file.into_std().await.into_raw_fd(); + let vsockfd = OwnedFd::from(vsock_file.into_std().await); for _i in 0..IOCTL_TRY_TIMES { let cid = thread_rng().gen_range(3..i32::MAX as u64); - let res = unsafe { set_vhost_guest_cid(vsockfd, &cid) }; + let res = unsafe { set_vhost_guest_cid(vsockfd.as_raw_fd(), &cid) }; match res { Ok(_) => return Ok((vsockfd, cid)), Err(_) => continue, diff --git a/vmm/sandbox/src/qemu/mod.rs b/vmm/sandbox/src/qemu/mod.rs index 9b8ddcec..855457eb 100644 --- a/vmm/sandbox/src/qemu/mod.rs +++ b/vmm/sandbox/src/qemu/mod.rs @@ -16,7 +16,10 @@ limitations under the License. use std::{ collections::HashMap, - os::unix::io::{AsRawFd, FromRawFd, RawFd}, + os::{ + fd::OwnedFd, + unix::io::{AsRawFd, FromRawFd, RawFd}, + }, time::{Duration, SystemTime}, }; @@ -82,7 +85,8 @@ pub struct QemuVM { devices: Vec>, #[serde(skip)] hot_attached_devices: Vec>, - fds: Vec, + #[serde(skip)] + fds: Vec, console_socket: String, agent_socket: String, netns: String, @@ -101,8 +105,6 @@ impl VM for QemuVM { debug!("start vm {}", self.id); let wait_chan = self.launch().await?; self.wait_chan = Some(wait_chan); - // close the fds after launch qemu - self.fds = vec![]; let start_time = SystemTime::now(); loop { match self.create_client().await { @@ -342,17 +344,17 @@ impl QemuVM { self.devices.push(Box::new(device)); } - fn append_fd(&mut self, fd: RawFd) -> usize { + fn append_fd(&mut self, fd: OwnedFd) -> usize { self.fds.push(fd); self.fds.len() - 1 + 3 } - async fn launch(&self) -> Result> { + async fn launch(&mut self) -> Result> { let mut params = self.config.to_cmdline_params("-"); for d in self.devices.iter() { params.extend(d.to_cmdline_params("-")); } - let fds = self.fds.to_vec(); + let fds: Vec = self.fds.drain(..).collect(); let path = self.config.path.to_string(); // pid file should not be empty let pid_file = self.config.pid_file.to_string(); @@ -367,20 +369,17 @@ impl QemuVM { spawn_blocking(move || -> Result<()> { let mut cmd = unshare::Command::new(&*path_clone); cmd.args(params.as_slice()); - for (i, &x) in fds.iter().enumerate() { - cmd.file_descriptor( - (3 + i) as RawFd, - Fd::from_file(unsafe { std::fs::File::from_raw_fd(x) }), - ); + let pipe_writer2 = pipe_writer.try_clone()?; + cmd.stdout(unshare::Stdio::from_file(pipe_writer)); + cmd.stderr(unshare::Stdio::from_file(pipe_writer2)); + for (i, x) in fds.into_iter().enumerate() { + cmd.file_descriptor((3 + i) as RawFd, Fd::from_file(std::fs::File::from(x))); } if !netns.is_empty() { let netns_fd = nix::fcntl::open(&*netns, OFlag::O_CLOEXEC, Mode::empty()) .map_err(|e| anyhow!("failed to open netns {}", e))?; cmd.set_namespace(&netns_fd, unshare::Namespace::Net)?; } - let pipe_writer2 = pipe_writer.try_clone()?; - cmd.stdout(unshare::Stdio::from_file(pipe_writer)); - cmd.stderr(unshare::Stdio::from_file(pipe_writer2)); let mut child = cmd .spawn() .map_err(|e| anyhow!("failed to spawn qemu command: {}", e))?; diff --git a/vmm/sandbox/src/stratovirt/devices/vsock.rs b/vmm/sandbox/src/stratovirt/devices/vsock.rs index 2239f011..f441fc9f 100644 --- a/vmm/sandbox/src/stratovirt/devices/vsock.rs +++ b/vmm/sandbox/src/stratovirt/devices/vsock.rs @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -use std::os::unix::io::{IntoRawFd, RawFd}; +use std::os::fd::{AsRawFd, OwnedFd}; use anyhow::anyhow; use containerd_sandbox::error::Result; @@ -62,7 +62,7 @@ impl VSockDevice { } } -pub async fn find_context_id() -> Result<(RawFd, u64)> { +pub async fn find_context_id() -> Result<(OwnedFd, u64)> { // TODO make sure if this thread_rng is enough, if we should new a seedable rng everytime. let vsock_file = tokio::fs::OpenOptions::new() .read(true) @@ -70,10 +70,10 @@ pub async fn find_context_id() -> Result<(RawFd, u64)> { .mode(0o666) .open(VHOST_VSOCK_DEV_PATH) .await?; - let vsockfd = vsock_file.into_std().await.into_raw_fd(); + let vsockfd = OwnedFd::from(vsock_file.into_std().await); for _i in 0..IOCTL_TRY_TIMES { let cid = thread_rng().gen_range(3..i32::MAX as u64); - let res = unsafe { set_vhost_guest_cid(vsockfd, &cid) }; + let res = unsafe { set_vhost_guest_cid(vsockfd.as_raw_fd(), &cid) }; match res { Ok(_) => return Ok((vsockfd, cid)), Err(_) => continue, diff --git a/vmm/sandbox/src/stratovirt/mod.rs b/vmm/sandbox/src/stratovirt/mod.rs index 787cfff1..14e0620c 100644 --- a/vmm/sandbox/src/stratovirt/mod.rs +++ b/vmm/sandbox/src/stratovirt/mod.rs @@ -16,7 +16,10 @@ limitations under the License. use std::{ collections::HashMap, - os::unix::io::{AsRawFd, FromRawFd, RawFd}, + os::{ + fd::OwnedFd, + unix::io::{AsRawFd, FromRawFd, RawFd}, + }, time::{Duration, SystemTime}, }; @@ -87,7 +90,8 @@ pub struct StratoVirtVM { devices: Vec>, #[serde(skip)] hot_attached_devices: Vec>, - fds: Vec, + #[serde(skip)] + fds: Vec, console_socket: String, agent_socket: String, netns: String, @@ -309,17 +313,17 @@ impl StratoVirtVM { self.devices.push(Box::new(device)); } - fn append_fd(&mut self, fd: RawFd) -> usize { + fn append_fd(&mut self, fd: OwnedFd) -> usize { self.fds.push(fd); self.fds.len() - 1 + 3 } - async fn launch(&self) -> Result> { + async fn launch(&mut self) -> Result> { let mut params = self.config.to_cmdline_params("-"); for d in self.devices.iter() { params.extend(d.to_cmdline_params("-")); } - let fds = self.fds.to_vec(); + let fds: Vec = self.fds.drain(..).collect(); let path = self.config.path.to_string(); // pid file should not be empty let pid_file = self.config.pid_file.to_string(); @@ -335,11 +339,11 @@ impl StratoVirtVM { let mut cmd = unshare::Command::new(&*path_clone); cmd.args(params.as_slice()); - for (i, &x) in fds.iter().enumerate() { - cmd.file_descriptor( - (3 + i) as RawFd, - Fd::from_file(unsafe { std::fs::File::from_raw_fd(x) }), - ); + let pipe_writer2 = pipe_writer.try_clone()?; + cmd.stdout(unshare::Stdio::from_file(pipe_writer)); + cmd.stderr(unshare::Stdio::from_file(pipe_writer2)); + for (i, x) in fds.into_iter().enumerate() { + cmd.file_descriptor((3 + i) as RawFd, Fd::from_file(std::fs::File::from(x))); } if !netns.is_empty() { @@ -347,9 +351,6 @@ impl StratoVirtVM { .map_err(|e| anyhow!("failed to open netns {}", e))?; cmd.set_namespace(&netns_fd, unshare::Namespace::Net)?; } - let pipe_writer2 = pipe_writer.try_clone()?; - cmd.stdout(unshare::Stdio::from_file(pipe_writer)); - cmd.stderr(unshare::Stdio::from_file(pipe_writer2)); let mut child = cmd .spawn() .map_err(|e| anyhow!("failed to spawn stratovirt command: {}", e))?; diff --git a/vmm/sandbox/src/utils.rs b/vmm/sandbox/src/utils.rs index 37f11261..729473bf 100644 --- a/vmm/sandbox/src/utils.rs +++ b/vmm/sandbox/src/utils.rs @@ -15,9 +15,13 @@ limitations under the License. */ use std::{ - os::unix::{ - io::RawFd, - prelude::{AsRawFd, FromRawFd, OwnedFd}, + mem, + os::{ + fd::IntoRawFd, + unix::{ + io::RawFd, + prelude::{AsRawFd, FromRawFd, OwnedFd}, + }, }, path::Path, str::FromStr, @@ -32,10 +36,11 @@ use containerd_sandbox::{ }; use log::{error, info}; use nix::{ - fcntl::{open, OFlag}, - libc::{dup2, fcntl, kill, setns, FD_CLOEXEC, F_GETFD, F_SETFD}, + fcntl::{fcntl, open, FdFlag, OFlag, F_GETFD, F_SETFD}, + libc::{kill, setns, FD_CLOEXEC}, sched::CloneFlags, sys::stat::Mode, + unistd::dup2, }; use time::OffsetDateTime; use tokio::{ @@ -450,24 +455,39 @@ pub fn set_cmd_netns(cmd: &mut Command, netns: String) -> Result<()> { Ok(()) } -pub fn set_cmd_fd(cmd: &mut Command, fds: Vec) -> Result<()> { +pub fn set_cmd_fd(cmd: &mut Command, mut fds: Vec) -> Result<()> { unsafe { cmd.pre_exec(move || { - for (i, &fd) in fds.iter().enumerate() { - let dest_fd = (3 + i) as RawFd; - let src_fd = fd; - - if src_fd == dest_fd { - let flags = fcntl(src_fd, F_GETFD); - if flags < 0 || fcntl(src_fd, F_SETFD, flags & !FD_CLOEXEC) < 0 { + for (i, fd) in mem::take(&mut fds).into_iter().enumerate() { + let new_fd = (3 + i) as RawFd; + + // Closing the fd when its lifecycle finished is unsafe, so transfer it into RawFD + // to let its closing not be influenced by rust lifecycle management. + let old_fd = fd.into_raw_fd(); + + if old_fd == new_fd { + // old_fd equals new_fd means the index is in the right place, so child process + // could used it directly. In this case, should remove CLOEXEC flag to avoid + // closing it after execve. + let flags = fcntl(old_fd, F_GETFD)?; + if flags < 0 { let e = std::io::Error::last_os_error(); - eprintln!("failed to call fnctl: {}", e); + eprintln!("failed to get fnctl F_GETFD: {}", e); return Err(e); + } else if let Err(e) = fcntl( + old_fd, + F_SETFD(FdFlag::from_bits_truncate(flags & !FD_CLOEXEC)), + ) { + eprintln!("failed to call fnctl F_SETFD: {}", e); + return Err(e.into()); } - } else if dup2(src_fd, dest_fd) < 0 { - let e = std::io::Error::last_os_error(); + } else if let Err(e) = dup2(old_fd, new_fd) { + // If not equals, old_fd will be closed after execve with CLOEXEC flag, + // which is also safe. eprintln!("failed to call dup2: {}", e); - return Err(e); + return Err(e.into()); + } else { + // dup2 succeeds, do nothing } } Ok(())