//
// Syd: rock-solid application kernel
// src/namespace.rs: Namespace utilities
//
// Copyright (c) 2026 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

//! Set of functions to manage namespaces

use std::{
    borrow::Cow,
    env,
    ffi::{CStr, OsStr},
    net::{IpAddr, SocketAddrV4, SocketAddrV6},
    os::{
        fd::{AsFd, AsRawFd, OwnedFd},
        unix::{ffi::OsStrExt, net::UnixStream, process::CommandExt},
    },
    process::{Command, Stdio},
};

use libc::setdomainname;
use memchr::arch::all::is_equal;
use nix::{
    errno::Errno,
    fcntl::OFlag,
    mount::{umount2, MntFlags, MsFlags},
    sys::{
        signal::Signal,
        socket::{bind, socket, AddressFamily, SockFlag, SockType, SockaddrIn, SockaddrIn6},
        stat::{mkdirat, mknodat, umask, Mode, SFlag},
    },
    time::{clock_gettime, ClockId},
    unistd::{chdir, fchdir, pivot_root, sethostname, setsid, symlinkat, Gid, Pid, Uid},
    NixPath,
};

use crate::{
    caps::{securebits::set_keepcaps, CapSet},
    compat::{openat2, set_no_new_privs, set_pdeathsig, ResolveFlag},
    config::{HAVE_NAMESPACED_PID_MAX, NPROC},
    confine::confine_landlock_scope,
    err::{err2no, SydResult},
    error,
    fd::{is_dev_null, pidfd_open, send_with_fd, set_cloexec, AT_BADFD},
    info, log_enabled,
    lookup::{file_type, safe_mkdir_all, safe_open_how},
    mount::{
        api::MountAttrFlags,
        util::{mount_bind, mount_fs, set_root_mount_propagation},
    },
    path::{XPath, XPathBuf},
    proc::{proc_map_user, proc_pid_max, proc_set_time},
    retry::retry_on_eintr,
    sandbox::BindMount,
    syslog::LogLevel,
    warn,
};

/// Prepare a freshly created user namespace.
///
/// Writes the uid/gid mapping through `fd_proc` and then makes the
/// permitted capabilities survive execve(2) by copying them into the
/// inheritable and ambient sets.
///
/// # Errors
///
/// Returns an error when the mapping cannot be written (this is logged)
/// or when any capability operation fails.
pub fn ns_setup_user<Fd: AsFd>(fd_proc: Fd, uid: Uid, gid: Gid, map_root: bool) -> SydResult<()> {
    // Map the given uid/gid inside the user namespace; log and bail on failure.
    if let Err(errno) = proc_map_user(fd_proc, uid, gid, map_root) {
        error!("ctx": "setup_user_namespace", "op": "map_user",
            "uid": uid.as_raw(), "gid": gid.as_raw(),
            "map_root": map_root, "err": errno as i32,
            "msg": format!("set up uid/gid mapping for user namespace failed: {errno}"),
            "tip": "configure your system to allow user namespaces");
        return Err(errno.into());
    }

    // Keep capabilities across execve(2):
    // the permitted set is mirrored into the inheritable set first...
    set_keepcaps(true)?;
    let permitted = crate::caps::read(None, CapSet::Permitted)?;
    crate::caps::set(None, CapSet::Inheritable, permitted)?;

    // ...and then every permitted capability is raised as ambient.
    for flag in permitted {
        crate::caps::raise(None, CapSet::Ambient, flag.try_into()?)?;
    }

    Ok(())
}

/// Set time offsets in new time namespace.
pub fn ns_setup_time<Fd: AsFd>(
    fd_proc: Fd,
    boottime: Option<i64>,
    monotime: Option<i64>,
) -> SydResult<()> {
    let boffset = if let Some(boffset) = boottime {
        boffset
    } else {
        let btime = clock_gettime(ClockId::CLOCK_BOOTTIME)?;
        // Into is necessary on 32-bit.
        #[expect(clippy::useless_conversion)]
        btime.tv_sec().checked_neg().ok_or(Errno::EOVERFLOW)?.into()
    };
    let moffset = if let Some(moffset) = monotime {
        moffset
    } else {
        let mtime = clock_gettime(ClockId::CLOCK_MONOTONIC)?;
        // Into is necessary on 32-bit.
        #[expect(clippy::useless_conversion)]
        mtime.tv_sec().checked_neg().ok_or(Errno::EOVERFLOW)?.into()
    };

    if boffset != 0 || moffset != 0 {
        proc_set_time(fd_proc, boffset, moffset).inspect_err(|errno| {
            error!("ctx": "setup_time_namespace", "op": "set_boot_time",
                "err": *errno as i32,
                "msg": format!("set boot time in time namespace failed: {errno}"),
                "tip": "configure your system to allow unprivileged user namespaces");
        })?;
        info!("ctx": "setup_time_namespace", "op": "set_boot_time",
            "timens_offsets": [moffset, boffset],
            "msg": "set boot time in time namespace");
    }

    Ok(())
}

/// Apply the requested host name and NIS/YP domain name in the UTS namespace.
///
/// Each name is optional; `None` leaves the corresponding value untouched.
/// The domain name is set before the host name.
///
/// # Errors
///
/// Returns the errno reported by setdomainname(2) or sethostname(2);
/// the failure is logged before it is returned.
pub fn ns_setup_uts(hostname: Option<&CStr>, domainname: Option<&CStr>) -> SydResult<()> {
    if let Some(domain) = domainname {
        // Length passed to the kernel excludes the trailing NUL.
        let len = domain.to_bytes().len();

        // SAFETY: No setdomainname(2) wrapper in nix yet.
        // The pointer and length both come from the same live `CStr`.
        let rc = unsafe { setdomainname(domain.as_ptr().cast(), len) };
        Errno::result(rc).inspect_err(|errno| {
            error!("ctx": "setup_uts_namespace", "op": "set_domain_name",
                "msg": format!("set NIS/YP domain name failed: {errno}"),
                "tip": "configure `uts/domain:none'",
                "err": *errno as i32);
        })?;
    }

    if let Some(host) = hostname {
        let host = OsStr::from_bytes(host.to_bytes());
        sethostname(host).inspect_err(|errno| {
            error!("ctx": "setup_uts_namespace", "op": "set_host_name",
                "msg": format!("set host name failed: {errno}"),
                "tip": "configure `uts/host:none'",
                "err": *errno as i32);
        })?;
    }

    Ok(())
}

/// Configure the namespaced `kernel.pid_max` sysctl for a new pid namespace.
///
/// Does nothing when `HAVE_NAMESPACED_PID_MAX` is false. Otherwise the
/// requested `pid_max` is raised, if needed, to at least `RESERVED_PIDS`
/// and to at least `PIDS_PER_CPU_MIN` pids per CPU before it is written
/// through `fd_proc`.
///
/// Notes:
/// 1. The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
/// 2. Kernel adapts reserved pids based on number of CPUs on the system.
/// 3. We cannot do this after private procfs mount due to subset=pid.
pub fn ns_setup_pid<Fd: AsFd>(fd_proc: Fd, pid_max: u64) -> SydResult<()> {
    const PIDS_PER_CPU_MIN: u64 = 8;
    const RESERVED_PIDS: u64 = if cfg!(target_arch = "s390x") {
        // TODO: Figure out where this limit on s390x comes from and document.
        512
    } else {
        301
    };

    // Without a namespaced pid_max there is nothing to configure.
    if !*HAVE_NAMESPACED_PID_MAX {
        return Ok(());
    }

    // Lower bound derived from the CPU count.
    #[expect(clippy::arithmetic_side_effects)]
    let per_cpu_floor = PIDS_PER_CPU_MIN * (*NPROC as u64);
    let pid_max = pid_max.max(RESERVED_PIDS).max(per_cpu_floor);

    proc_pid_max(fd_proc, pid_max)?;
    info!("ctx": "setup_pid_namespace", "op": "set_pid_max", "max": pid_max,
        "msg": format!("set namespaced kernel.pid_max sysctl to {pid_max}"));

    Ok(())
}

/// Set up mount namespace (after fork).
///
/// Steps, in order:
/// 1. Set root mount propagation to private.
/// 2. If `root` is given, prepare the new root directory and make it a
///    mountpoint (fresh ramfs/tmpfs, or a self-bind of the given directory).
/// 3. Process `bind_mounts`.
/// 4. Mount a private procfs.
/// 5. If `root` is given, provide `/dev/null` and the `/dev/std{in,out,err}`
///    symbolic links, then pivot_root(2) into the new root and detach the
///    old one.
///
/// # Arguments
///
/// * `root` - New root directory. The special values `ramfs` and `tmpfs`
///   mount a filesystem of that type over `/proc/<pid>/fdinfo` and use it
///   as a private root. `None` leaves the root directory unchanged.
/// * `bind_mounts` - Mounts to perform. An entry with a relative `src` is
///   mounted as a special filesystem of that type via `mount_fs`; an entry
///   with an absolute `src` is bind mounted via `mount_bind`.
/// * `restrict_proc_files` - Appends `subset=pid` to the procfs mount options.
///
/// # Errors
///
/// * `EINVAL` if a bind mount targets the root filesystem while `root` is set.
/// * `ENODEV` if `/dev/null` fails validation.
/// * Any error returned by the open, mkdir, mknod, mount, chdir,
///   pivot_root(2) or umount2(2) calls made along the way.
#[expect(clippy::cognitive_complexity)]
pub fn ns_setup_mnt(
    root: Option<&XPath>,
    bind_mounts: Option<&[BindMount]>,
    restrict_proc_files: bool,
) -> SydResult<()> {
    // Open-how templates used throughout:
    // `how`/`how_dir` refuse symlinks and magic links,
    // `how_xdev*` are built with RESOLVE_NO_XDEV and are used for
    // lookups relative to the new root,
    // `how_xdev_new` exclusively creates a new read-only file.
    let how = safe_open_how(OFlag::O_PATH | OFlag::O_NOFOLLOW, ResolveFlag::empty())
        // Drop RESOLVE_BENEATH which we cannot use here.
        .resolve(ResolveFlag::RESOLVE_NO_MAGICLINKS | ResolveFlag::RESOLVE_NO_SYMLINKS);
    let how_dir =
        how.flags(OFlag::O_CLOEXEC | OFlag::O_PATH | OFlag::O_NOFOLLOW | OFlag::O_DIRECTORY);
    let how_xdev = safe_open_how(
        OFlag::O_PATH | OFlag::O_NOFOLLOW,
        ResolveFlag::RESOLVE_NO_XDEV,
    );
    let how_xdev_dir =
        how_xdev.flags(OFlag::O_CLOEXEC | OFlag::O_PATH | OFlag::O_NOFOLLOW | OFlag::O_DIRECTORY);
    let how_xdev_new = how_xdev
        .flags(OFlag::O_CLOEXEC | OFlag::O_RDONLY | OFlag::O_CREAT | OFlag::O_EXCL)
        .mode(Mode::S_IRUSR);

    // Set root mount propagation to private recursively.
    set_root_mount_propagation(MsFlags::MS_PRIVATE)?;

    // Prepare the new root, if any.
    // `root_fd` is the directory fd of the new root,
    // `root_is_tmpfs` tells whether it is a fresh ramfs/tmpfs.
    let (root_fd, root_is_tmpfs) = if let Some(root) = root {
        let root_tmp = if root.is_equal(b"ramfs") {
            Some("ramfs")
        } else if root.is_equal(b"tmpfs") {
            Some("tmpfs")
        } else {
            None
        };
        let root = if root_tmp.is_some() {
            // Use proc_pid_fdinfo(5) as private root directory.
            // This avoids the need for a temporary directory.
            // Syd requires proc(5) so this isn't an added dependency.
            let mut pfd = XPathBuf::from("/proc");
            pfd.push_pid(Pid::this());
            pfd.push(b"fdinfo");
            Cow::Owned(pfd)
        } else {
            // Use specified new-root directory.
            Cow::Borrowed(root)
        };

        #[expect(clippy::disallowed_methods)]
        let mut fd = retry_on_eintr(|| openat2(AT_BADFD, root.as_ref(), how_dir))?;

        #[expect(clippy::disallowed_methods)]
        if let Some(root_tmp) = root_tmp {
            // Mount tmpfs over root filesystem with default options.
            // This also ensures rootfs is a mountpoint which is required
            // by pivot_root(2).
            mount_fs(
                OsStr::new(root_tmp),
                fd,
                MountAttrFlags::MOUNT_ATTR_NOSUID
                    | MountAttrFlags::MOUNT_ATTR_NODEV
                    | MountAttrFlags::MOUNT_ATTR_NOEXEC
                    | MountAttrFlags::MOUNT_ATTR_NOATIME
                    | MountAttrFlags::MOUNT_ATTR_NOSYMFOLLOW,
                Some("mode=700"),
            )?;

            // Reopen rootfd after rootfs mount.
            fd = retry_on_eintr(|| openat2(AT_BADFD, root.as_ref(), how_dir))?;

            // Ensure safe CWD.
            // This is important because we may recursively
            // create directories later on in this directory.
            // This point is too early for safe_chdir,
            // here we run without confinement.
            fchdir(&fd)?;

            // Create /dev and /proc directories.
            mkdirat(&fd, "dev", Mode::S_IRWXU)?;
            mkdirat(&fd, "proc", Mode::S_IRWXU)?;
        } else {
            // Make new root directory a mountpoint with a self-bind.
            // This is required by pivot_root(2).
            mount_bind(&fd, &fd, MountAttrFlags::empty())?;

            // Reopen rootfd after rootfs mount.
            drop(fd);
            fd = retry_on_eintr(|| openat2(AT_BADFD, root.as_ref(), how_dir))?;

            // Mount /dev tmpfs with default options.
            // Do it early here so the user can populate it.
            // Note, this `fd` shadows the root fd only within this block.
            #[expect(clippy::disallowed_methods)]
            let fd = retry_on_eintr(|| openat2(&fd, "dev", how_dir))?;

            mount_fs(
                OsStr::new("tmpfs"),
                fd,
                MountAttrFlags::MOUNT_ATTR_NOSUID
                    | MountAttrFlags::MOUNT_ATTR_NOEXEC
                    | MountAttrFlags::MOUNT_ATTR_NOATIME
                    | MountAttrFlags::MOUNT_ATTR_NOSYMFOLLOW,
                Some("mode=700"),
            )?;
        }

        (Some(fd), root_tmp.is_some())
    } else {
        (None, false)
    };

    // Process bind mounts as necessary.
    if let Some(bind_mounts) = bind_mounts {
        for bind in bind_mounts {
            // SAFETY: Ensure root cannot be mounted over.
            if root_fd.is_some() && bind.dst.is_rootfs() {
                let errno = Errno::EINVAL;
                error!("ctx": "setup_mount_namespace", "op": "spec_mount", "mnt": &bind, "err": errno as i32,
                    "msg": format!("mount over rootfs is not permitted: {errno}"));
                return Err(errno.into());
            }

            // A relative source is mounted as a special filesystem
            // whose type is the source name.
            if bind.src.is_relative() {
                // Note `bind.dst` is always an absolute path.
                let dst_fd = if let Some(ref root_fd) = root_fd {
                    // SAFETY: Ensure no consecutive slashes exist.
                    let mut dst = bind.dst.clone();
                    dst.clean_consecutive_slashes();
                    dst.remove(0); // Turn into relative path.

                    if root_is_tmpfs && !dst.is_empty() {
                        // Create directories recursively under temporary root.
                        // SAFETY:
                        // 1. `dst` is relative.
                        // 2. `dst` has no `..` components.
                        // 3. Uses fd-based mkdirat+O_NOFOLLOW to prevent symlink attacks.
                        safe_mkdir_all(root_fd, &dst)
                    } else {
                        #[expect(clippy::disallowed_methods)]
                        retry_on_eintr(|| openat2(root_fd, &dst, how_xdev_dir))
                    }
                } else {
                    #[expect(clippy::disallowed_methods)]
                    retry_on_eintr(|| openat2(AT_BADFD, &bind.dst, how_dir))
                }?;

                match mount_fs(bind.src.as_os_str(), dst_fd, bind.opt, bind.dat.as_deref()) {
                    Ok(_) => {
                        info!("ctx": "setup_mount_namespace", "op": "spec_mount", "mnt": &bind,
                            "msg": format!("mounted special-fs `{bind}'"));
                    }
                    Err(errno) => {
                        error!("ctx": "setup_mount_namespace", "op": "spec_mount", "mnt": &bind, "err": errno as i32,
                            "msg": format!("mount special-fs `{bind}' failed: {errno}"));
                        return Err(errno.into());
                    }
                }
            } else {
                // An absolute source is bind mounted.
                // The source type decides whether the target
                // is opened (and created) as a directory or a file.
                #[expect(clippy::disallowed_methods)]
                let src_fd = retry_on_eintr(|| openat2(AT_BADFD, &bind.src, how))?;
                let is_dir = file_type(&src_fd, None, false)?.is_dir();

                // Note `bind.dst` is always an absolute path.
                let dst_fd = if let Some(ref root_fd) = root_fd {
                    // SAFETY: Ensure no consecutive slashes exist.
                    let mut dst = bind.dst.clone();
                    dst.clean_consecutive_slashes();
                    dst.remove(0); // Turn into relative path.
                    let empty = dst.is_empty();

                    // Create file or directory under temporary root.
                    if root_is_tmpfs && !empty {
                        if is_dir {
                            // Create all directories (including target) safely.
                            // SAFETY:
                            // 1. `dst` is relative.
                            // 2. `dst` has no `..` components.
                            // 3. Uses fd-based mkdirat+O_NOFOLLOW per component.
                            safe_mkdir_all(root_fd, &dst).map(drop)?;
                        } else {
                            // Create parent directories safely, then create
                            // the file target using the parent fd to avoid
                            // multi-component paths in mknodat(2).
                            let (parent, base) = dst.split();
                            if !parent.is_empty() {
                                safe_mkdir_all(root_fd, parent).and_then(|fd| {
                                    mknodat(fd, base, SFlag::S_IFREG, Mode::S_IRUSR, 0)
                                })?;
                            } else {
                                // No parent: Use root_fd directly.
                                mknodat(root_fd, base, SFlag::S_IFREG, Mode::S_IRUSR, 0)?;
                            }
                        }
                    }

                    // Ensure we open current directory for `/`.
                    if dst.is_empty() {
                        dst.append_byte(b'.');
                    }

                    #[expect(clippy::disallowed_methods)]
                    retry_on_eintr(|| {
                        openat2(root_fd, &dst, if is_dir { how_xdev_dir } else { how_xdev })
                    })
                } else {
                    #[expect(clippy::disallowed_methods)]
                    retry_on_eintr(|| {
                        openat2(AT_BADFD, &bind.dst, if is_dir { how_dir } else { how })
                    })
                }?;

                // Perform recursive bind mount.
                match mount_bind(src_fd, dst_fd, bind.opt) {
                    Ok(_) => {
                        info!("ctx": "setup_mount_namespace", "op": "bind_mount", "mnt": &bind,
                            "msg": format!("bind mounted `{bind}'"));
                    }
                    Err(errno) => {
                        error!("ctx": "setup_mount_namespace", "op": "bind_mount", "mnt": &bind, "err": errno as i32,
                            "msg": format!("bind mount `{bind}' failed: {errno}"));
                        return Err(errno.into());
                    }
                }
            }
        }
    }

    // Unshare/Mount implies Unshare/PID.
    //
    // Mount private procfs.
    //
    // The target directory may be under the chroot directory.
    // Use hidepid=2 to hide pid=1.
    // As of version 3.37.2 we use hidepid=4 which is Linux>=5.8.
    // As of version 3.39.0 we use subset=pid which is Linux>=5.8.
    //
    // SAFETY: Private procfs is mounted _after_ custom bind mounts
    // to ensure they cannot interfere with this mount.
    #[expect(clippy::disallowed_methods)]
    let proc_fd = if let Some(ref root_fd) = root_fd {
        retry_on_eintr(|| openat2(root_fd, "proc", how_xdev_dir))
    } else {
        retry_on_eintr(|| openat2(AT_BADFD, "/proc", how_dir))
    }?;

    let flags = MountAttrFlags::MOUNT_ATTR_NOSUID
        | MountAttrFlags::MOUNT_ATTR_NOEXEC
        | MountAttrFlags::MOUNT_ATTR_NODEV;
    let mut opts = "hidepid=4".to_string();
    if restrict_proc_files {
        opts.push_str(",subset=pid");
    }
    mount_fs(OsStr::new("proc"), proc_fd, flags, Some(opts.as_str()))?;
    // Build the descriptive BindMount only when it is going to be logged.
    if log_enabled!(LogLevel::Info) {
        let bind = BindMount {
            src: "proc".into(),
            dst: "/proc".into(),
            opt: flags,
            dat: Some(opts),
        };
        info!("ctx": "setup_mount_namespace", "op": "mount_procfs", "mnt": &bind,
            "msg": format!("mounted procfs `{bind}'"));
    }

    if let Some(ref root_fd) = root_fd {
        // Provide /dev/null which is required by Syd.
        #[expect(clippy::disallowed_methods)]
        let src_fd = retry_on_eintr(|| openat2(AT_BADFD, "/dev/null", how))?;

        // SAFETY: Validate what we've opened is indeed `/dev/null`.
        // A failed check is treated the same as a negative answer.
        if !is_dev_null(&src_fd).unwrap_or(false) {
            let errno = Errno::ENODEV;
            error!("ctx": "setup_mount_namespace", "op": "null_mount", "err": errno as i32,
                "msg": format!("/dev/null is not a character device"));
            return Err(errno.into());
        }

        // With a temporary root, `dev` is a plain directory created above;
        // otherwise it is the tmpfs mounted above, which is a separate
        // mount, hence RESOLVE_NO_XDEV is not used for it.
        #[expect(clippy::disallowed_methods)]
        let dev_fd = retry_on_eintr(|| {
            openat2(
                root_fd,
                "dev",
                if root_is_tmpfs { how_xdev_dir } else { how_dir },
            )
        })?;

        // Exclusively create the mount target `dev/null`.
        #[expect(clippy::disallowed_methods)]
        let dst_fd = retry_on_eintr(|| openat2(&dev_fd, "null", how_xdev_new))?;

        // Perform recursive bind mount.
        mount_bind(src_fd, dst_fd, MountAttrFlags::empty())?;

        // Provide symbolic links for standard file descriptors.
        //
        // Note, these symbolic links are user-owned so the sandbox
        // process may remove them. Since these symbolic links are
        // only for convenience, we do not check for errors.
        //
        // /dev/stdin
        let _ = symlinkat("/proc/thread-self/fd/0", &dev_fd, "stdin");
        // /dev/stdout
        let _ = symlinkat("/proc/thread-self/fd/1", &dev_fd, "stdout");
        // /dev/stderr
        let _ = symlinkat("/proc/thread-self/fd/2", &dev_fd, "stderr");
        drop(dev_fd);

        // All set, change root directory.
        // Move old mount over itself.
        // This point is too early for safe_chdir,
        // here we run without confinement.
        fchdir(root_fd)?;
        pivot_root(".", ".")?;

        // Unmount old root directory.
        umount2(".", MntFlags::MNT_DETACH)?;

        // Ensure CWD equals root.
        chdir("/")?;
    }

    Ok(())
}

/// Bring up networking inside a new network namespace.
///
/// The loopback device is always brought up. When `proxy` is given, a
/// non-blocking stream socket is additionally bound to
/// `proxy_addr`/`proxy_port` and its descriptor is sent to syd-tor over
/// the `proxy` socket. With `proxy_debug` set, progress is logged at
/// warning level rather than info level.
///
/// # Errors
///
/// Returns the error of the first failing step: loopback setup, socket
/// creation, bind, or sending the descriptor.
#[expect(clippy::cognitive_complexity)]
pub fn ns_setup_net<Fd: AsFd>(
    proxy: Option<Fd>,
    proxy_addr: IpAddr,
    proxy_port: u16,
    proxy_debug: bool,
) -> SydResult<()> {
    // Loopback first: the namespace starts with the device down.
    let loindex = loopback_setup().inspect_err(|errno| {
        error!("ctx": "setup_network_namespace", "op": "setup_loopback",
            "err": *errno as i32,
            "msg": format!("set up loopback network device failed: {errno}"),
            "tip": "configure your system to allow network namespaces");
    })?;

    if proxy_debug {
        warn!("ctx": "setup_network_namespace", "op": "setup_loopback",
            "idx": loindex,
            "msg": format!("loopback network device is up with index:{loindex:#x}"));
    } else {
        info!("ctx": "setup_network_namespace", "op": "setup_loopback",
            "idx": loindex,
            "msg": format!("loopback network device is up with index:{loindex:#x}"));
    }

    // Without a proxy socket there is nothing more to do.
    let Some(stream_child) = proxy else {
        return Ok(());
    };

    // Pick IP version (for logging) and address family together.
    let (ipv, fml) = match proxy_addr {
        IpAddr::V6(_) => (6, AddressFamily::Inet6),
        IpAddr::V4(_) => (4, AddressFamily::Inet),
    };

    // The owned socket stays alive until the end of this function,
    // `lfd` is its raw descriptor number.
    let listener = socket(
        fml,
        SockType::Stream,
        SockFlag::SOCK_NONBLOCK | SockFlag::SOCK_CLOEXEC,
        None,
    )?;
    let lfd = listener.as_raw_fd();

    let bound = match proxy_addr {
        IpAddr::V4(addr_v4) => bind(
            lfd,
            &SockaddrIn::from(SocketAddrV4::new(addr_v4, proxy_port)),
        ),
        IpAddr::V6(addr_v6) => bind(
            lfd,
            &SockaddrIn6::from(SocketAddrV6::new(addr_v6, proxy_port, 0, 0)),
        ),
    };
    bound.inspect_err(|errno| {
        error!("ctx": "setup_network_namespace", "op": "bind_proxy",
            "msg": format!("bind proxy to IPv{ipv} {proxy_addr}!{proxy_port} failed: {errno}"),
            "err": *errno as i32);
    })?;

    if proxy_debug {
        warn!("ctx": "setup_network_namespace", "op": "bind_proxy",
            "msg": format!("proxy is now listening incoming IPv{ipv} requests from {proxy_addr}!{proxy_port}"));
    } else {
        info!("ctx": "setup_network_namespace", "op": "bind_proxy",
            "msg": format!("proxy is now listening incoming IPv{ipv} requests from {proxy_addr}!{proxy_port}"));
    }

    // Hand the bound socket over to syd-tor together with a single byte.
    send_with_fd(&stream_child, &[0u8; 1], &[lfd]).inspect_err(|errno| {
        error!("ctx": "setup_network_namespace", "op": "send_proxy_fd",
            "fd": lfd, "err": *errno as i32,
            "msg": format!("send proxy file descriptor {lfd} to syd-tor failed: {errno}"));
    })?;

    if proxy_debug {
        warn!("ctx": "setup_network_namespace", "op": "send_proxy_fd", "fd": lfd,
            "msg": format!("sent proxy fd {lfd} to syd-tor, IPv{ipv} traffic forwarding is now started \\o/"));
        warn!("ctx": "setup_network_namespace", "op": "send_proxy_fd", "syd": "ping",
            "msg": "Change return success. Going and coming without error.");
    } else {
        info!("ctx": "setup_network_namespace", "op": "send_proxy_fd", "fd": lfd,
            "msg": format!("sent proxy fd {lfd} to syd-tor, IPv{ipv} traffic forwarding is now started \\o/"));
        info!("ctx": "setup_network_namespace", "op": "send_proxy_fd", "syd": "ping",
            "msg": "Change return success. Going and coming without error.");
    }

    Ok(())
}

/// Spawn syd-tor for Proxy sandboxing.
///
/// syd-tor is started from `/proc/self/exe` with a filtered environment.
/// It receives a pidfd of this process (`-p`), one end of a UNIX socket
/// pair (`-i`) and the external endpoint: `-u <path>` when
/// `proxy_ext_unix` is set, otherwise `-o <addr>:<port>`.
/// `proxy_repr` is only used in the log message. With `proxy_debug`,
/// `-d` is passed, stderr is inherited and logging uses warning level.
///
/// Returns the other end of the socket pair, which is meant to be passed
/// on to the new namespace.
///
/// # Errors
///
/// Fails when the pidfd or the socket pair cannot be created, when the
/// close-on-exec flag cannot be cleared, or when spawning fails.
#[expect(clippy::cognitive_complexity)]
pub fn ns_setup_tor(
    proxy_ext_addr: IpAddr,
    proxy_ext_port: u16,
    proxy_ext_unix: Option<&XPath>,
    proxy_repr: &str,
    proxy_debug: bool,
) -> SydResult<OwnedFd> {
    // TIP to be used in logging.
    const TIP: &str = "set sandbox/proxy:off";

    // Environment variables syd-tor is allowed to inherit.
    // SYD_TOR_DEBUG is not needed because -d is passed as needed.
    const SAFE_ENV: &[&[u8]] = &[b"LD_LIBRARY_PATH", b"SYD_TOR_RULES"];

    // PIDFd of this process.
    // PIDFD_NONBLOCK is equivalent to O_NONBLOCK,
    // the latter is used because bionic libc doesn't define the former yet.
    #[expect(clippy::cast_sign_loss)]
    let pidfd_flags = OFlag::O_NONBLOCK.bits() as u32;
    let pidfd = pidfd_open(Pid::this(), pidfd_flags).inspect_err(|errno| {
        error!("ctx": "setup_tor", "op": "pidfd_open",
                "msg": format!("syd-tor pidfd_open error: {errno}"),
                "tip": TIP, "err": *errno as i32);
    })?;

    // UNIX socket pair: parent end goes to syd-tor, child end to the caller.
    let (stream_parent, stream_child) = UnixStream::pair().inspect_err(|error| {
        error!("ctx": "setup_tor", "op": "socketpair",
                "msg": format!("syd-tor socketpair error: {error}"),
                "tip": TIP, "err": err2no(error) as i32);
    })?;

    // PIDFds and Rust sockets are automatically CLOEXEC,
    // clear the flag so that syd-tor inherits both descriptors.
    set_cloexec(&pidfd, false)?;
    set_cloexec(&stream_parent, false)?;

    // Spawn syd-tor outside the namespace.
    // ns_init_tor sets process name which syd(1) recognizes.
    let mut cmd = Command::new("/proc/self/exe");
    cmd.arg0("syd-tor")
        .stdin(Stdio::inherit())
        .stdout(Stdio::inherit())
        .env_clear()
        .envs(env::vars_os().filter(|(key, _)| {
            SAFE_ENV
                .iter()
                .any(|name| is_equal(key.as_bytes(), name))
        }));

    // Debug mode adds -d as the first argument and keeps stderr attached.
    let child_stderr = if proxy_debug {
        cmd.arg("-d");
        Stdio::inherit()
    } else {
        Stdio::null()
    };
    cmd.stderr(child_stderr);

    // Descriptor numbers are passed as decimal arguments.
    let mut buf = itoa::Buffer::new();
    cmd.arg("-p").arg(buf.format(pidfd.as_raw_fd()));
    cmd.arg("-i").arg(buf.format(stream_parent.as_raw_fd()));

    // proxy/ext/unix has precedence over proxy/ext/host.
    match proxy_ext_unix {
        Some(unix) => {
            cmd.arg("-u").arg(unix);
        }
        None => {
            cmd.arg("-o")
                .arg(format!("{proxy_ext_addr}:{proxy_ext_port}"));
        }
    }

    // SAFETY: See documentation in ns_init_tor.
    unsafe { cmd.pre_exec(|| Ok(ns_init_tor()?)) };
    cmd.spawn().inspect_err(|error| {
        let errno = err2no(error);
        error!("ctx": "setup_tor", "op": "spawn",
            "msg": format!("syd-tor spawn error: {error}"),
            "tip": TIP, "err": errno as i32);
    })?;

    // The child holds its own copy of the pidfd by now.
    drop(pidfd);

    if proxy_debug {
        warn!("ctx": "setup_tor", "op": "forward_net",
            "msg": format!("syd-tor is now forwarding external traffic to {proxy_repr}"));
    } else {
        info!("ctx": "setup_tor", "op": "forward_net",
            "msg": format!("syd-tor is now forwarding external traffic to {proxy_repr}"));
    }

    // Pass the other end of the socket-pair to the new namespace.
    Ok(OwnedFd::from(stream_child))
}

// Pre-exec confinement for the syd-tor(1) child.
//
// This is installed with Command::pre_exec by ns_setup_tor, so it runs
// in the forked child before syd-tor(1) is executed.
// Confinement happens in two stages:
// 1. here, before syd-tor(1) is executed.
// 2. in syd-tor(1) itself, before its main loop.
//
// The two stages overlap on purpose: this one reduces the blast radius
// when Syd is misguided into executing a malicious syd-tor(1) binary.
//
// Returns the errno of the first step that fails.
fn ns_init_tor() -> Result<(), Errno> {
    // Step 1: Set the no-new-privs attribute.
    set_no_new_privs()?;

    // Step 2: Confine landlock-scope on Linux>=6.12.
    confine_landlock_scope()?;

    // Step 3: Set parent death signal to SIGKILL.
    set_pdeathsig(Some(Signal::SIGKILL))?;

    // Step 4: Create a new session.
    setsid()?;

    // Step 5: Set umask(2) so that every permission bit is masked;
    // the previous mask is of no interest.
    let _ = umask(Mode::from_bits_truncate(0o777));

    Ok(())
}

// ioctl(2) request numbers used by loopback_setup, taken from libc and
// stored uniformly as u64; they are cast to c_ulong at the call sites.

// Request: query the index of a network interface.
#[expect(clippy::unnecessary_cast)]
const SIOCGIFINDEX: u64 = libc::SIOCGIFINDEX as u64;
// Request: read the flags of a network interface.
#[expect(clippy::unnecessary_cast)]
const SIOCGIFFLAGS: u64 = libc::SIOCGIFFLAGS as u64;
// Request: write the flags of a network interface.
#[expect(clippy::unnecessary_cast)]
const SIOCSIFFLAGS: u64 = libc::SIOCSIFFLAGS as u64;

/// Bring the loopback device up, like "ifconfig lo up".
///
/// Before the device is brought up, an attempt is made to set BIGTCP on
/// it; a failure of that attempt is only logged.
///
/// Returns loopback interface index.
///
/// # Errors
///
/// Returns the errno of socket creation or of any of the three ioctl(2)
/// requests (get index, get flags, set flags).
#[expect(clippy::cognitive_complexity)]
pub fn loopback_setup() -> Result<i32, Errno> {
    use crate::config::LOOPBACK_BIGTCP_MAX;

    // Any socket will do as a handle for the interface ioctls.
    let sock = socket(
        AddressFamily::Inet,
        SockType::Stream,
        SockFlag::empty(),
        None,
    )?;
    let fd = sock.as_raw_fd();

    // Interface name "lo", padded with NUL bytes.
    let mut lo_name = [0 as libc::c_char; 16];
    #[expect(clippy::cast_possible_wrap)]
    for (dst, src) in lo_name.iter_mut().zip(b"lo") {
        *dst = *src as libc::c_char;
    }

    // Interface request reused for reading and writing the flags.
    let mut ifreq = libc::ifreq {
        ifr_name: lo_name,
        // SAFETY: Manually initialize ifr_ifru.
        ifr_ifru: unsafe { std::mem::zeroed() },
    };

    // SAFETY: Request loopback network device index,
    // using a separate zeroed request carrying the same name.
    let loindex: i32 = unsafe {
        let mut ifr_index: libc::ifreq = std::mem::zeroed();
        ifr_index.ifr_name = ifreq.ifr_name;
        Errno::result(libc::syscall(
            libc::SYS_ioctl,
            fd,
            SIOCGIFINDEX as libc::c_ulong,
            &mut ifr_index,
        ))?;
        // HACK: ifr_ifru is a union but libc crate does not define ifru_ivalue,
        // which is a libc::c_int, so here we refer to it with ifru_mtu which
        // is the same type.
        ifr_index.ifr_ifru.ifru_mtu
    };

    // Set BIGTCP to LOOPBACK_BIGTCP_MAX if available.
    // Note, we _must_ do this before setting up the network device.
    if let Err(errno) = loopback_set_bigtcp(loindex, LOOPBACK_BIGTCP_MAX) {
        info!("ctx": "setup_network_namespace", "op": "set_bigtcp_loopback",
            "msg": format!("set BIGTCP for loopback network device error: {errno}"),
            "err": errno as i32);
    } else {
        info!("ctx": "setup_network_namespace", "op": "set_bigtcp_loopback",
            "msg": "loopback network device has BIGTCP set",
            "max": LOOPBACK_BIGTCP_MAX);
    }

    // SAFETY: Get the current flags.
    let rc = unsafe {
        libc::syscall(
            libc::SYS_ioctl,
            fd,
            SIOCGIFFLAGS as libc::c_ulong,
            &mut ifreq,
        )
    };
    Errno::result(rc)?;

    // Flags that bring the interface up.
    #[expect(clippy::cast_possible_truncation)]
    let up_mask = (libc::IFF_UP | libc::IFF_RUNNING) as libc::c_short;

    // SAFETY: We're accessing the field of a union here,
    // it was filled in by the flags request above.
    unsafe { ifreq.ifr_ifru.ifru_flags |= up_mask };

    // SAFETY: Set the new flags.
    let rc = unsafe {
        libc::syscall(
            libc::SYS_ioctl,
            fd,
            SIOCSIFFLAGS as libc::c_ulong,
            &mut ifreq,
        )
    };
    Errno::result(rc)?;

    Ok(loindex)
}

// Netlink attribute header (struct nlattr), declared locally.
// loopback_set_bigtcp writes one of these in front of each attribute payload.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct nlattr {
    // Length of the attribute: this header plus the payload that follows.
    nla_len: u16,
    // Attribute type, e.g. one of the IFLA_* constants.
    nla_type: u16,
}

// Layout of the netlink request built by loopback_set_bigtcp:
// the netlink header, the interface info message, and a fixed-size
// area into which the attributes are written.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct nlmsg {
    // Netlink message header.
    hdr: libc::nlmsghdr,
    // Interface info message selecting the target interface.
    info: ifinfomsg,
    // Raw storage for the nlattr headers and their payloads.
    attrs: [u8; 64],
}

// libc crate does not define struct ifinfomsg from linux/rtnetlink.h yet.
// loopback_set_bigtcp fills in `family`, `index` and `change`;
// the remaining fields stay zeroed.
#[repr(C)]
#[derive(Debug, Copy, Clone)]
struct ifinfomsg {
    family: u8,
    pad: u8,
    ifi_type: u16, // ARPHRD_*
    index: i32,    // Interface index
    flags: u32,    // IFF_* flags
    change: u32,   // IFF_* change mask
}

// Link attribute types for the GRO/GSO maximum sizes.
// These values are based on the Linux kernel headers.
// loopback_set_bigtcp sends one attribute of each type.
const IFLA_GRO_IPV4_MAX_SIZE: libc::c_ushort = 0x40;
const IFLA_GRO_MAX_SIZE: libc::c_ushort = 0x3a;
const IFLA_GSO_IPV4_MAX_SIZE: libc::c_ushort = 0x3f;
const IFLA_GSO_MAX_SIZE: libc::c_ushort = 0x29;

/// Functionally equivalent to "ip link set dev $ifindex g{r,s}o_max_size $max_size",
/// which sets BIGTCP if available, see: https://lwn.net/Articles/884104/
///
/// Requires loopback interface index as argument.
pub fn loopback_set_bigtcp(ifindex: i32, max_size: u32) -> Result<(), Errno> {
    // Set BIGTCP to max_size if available.
    use netlink_sys::{constants::*, Socket, SocketAddr};

    // SAFETY: create netlink socket using netlink_sys for NETLINK_ROUTE.
    let mut sock = Socket::new(NETLINK_ROUTE)
        .map_err(|e| Errno::from_raw(e.raw_os_error().unwrap_or(libc::ENOSYS)))?;
    sock.bind(&SocketAddr::new(0, 0))
        .map_err(|e| Errno::from_raw(e.raw_os_error().unwrap_or(libc::ENOSYS)))?;

    // SAFETY: Zero initialize. netlink message.
    let mut msg: nlmsg = unsafe { std::mem::zeroed() };

    // SAFETY: Set up netlink header.
    let nl_hdr = &mut msg.hdr;
    #[expect(clippy::arithmetic_side_effects)]
    #[expect(clippy::cast_possible_truncation)]
    {
        nl_hdr.nlmsg_len = (size_of::<libc::nlmsghdr>() + size_of::<ifinfomsg>()) as u32;
        nl_hdr.nlmsg_type = libc::RTM_NEWLINK;
        nl_hdr.nlmsg_flags = (libc::NLM_F_REQUEST | libc::NLM_F_ACK) as u16;
        nl_hdr.nlmsg_seq = 1;
        nl_hdr.nlmsg_pid = 0;
    }

    // SAFETY: Populate ifinfomsg.
    let info = &mut msg.info;
    #[expect(clippy::cast_possible_truncation)]
    {
        info.family = libc::AF_UNSPEC as u8;
        info.index = ifindex;
        info.change = u32::MAX;
    }

    let mut offset = 0;
    for &kind in &[
        IFLA_GRO_IPV4_MAX_SIZE,
        IFLA_GRO_MAX_SIZE,
        IFLA_GSO_IPV4_MAX_SIZE,
        IFLA_GSO_MAX_SIZE,
    ] {
        // SAFETY: Write attribute header using unaligned write.
        #[expect(clippy::arithmetic_side_effects)]
        #[expect(clippy::cast_possible_truncation)]
        unsafe {
            let attr = nlattr {
                nla_type: kind,
                nla_len: (size_of::<nlattr>() + size_of::<u32>()) as u16,
            };
            let attr_ptr = msg.attrs.as_mut_ptr().add(offset);
            std::ptr::write_unaligned(attr_ptr.cast::<nlattr>(), attr);

            // Write u32 payload after the attribute header.
            let val_ptr = attr_ptr.add(size_of::<nlattr>());
            std::ptr::write_unaligned(val_ptr.cast::<u32>(), max_size);
        }

        #[expect(clippy::arithmetic_side_effects)]
        #[expect(clippy::cast_lossless)]
        #[expect(clippy::cast_sign_loss)]
        #[expect(clippy::cast_possible_truncation)]
        {
            let nla_len = (size_of::<nlattr>() + size_of::<u32>()) as u16;
            // SAFETY: NLA_ALIGN is a libc macro that computes alignment padding;
            // the input is a valid attribute length.
            offset += unsafe { libc::NLA_ALIGN(nla_len as libc::c_int) } as usize;
        }
    }

    #[expect(clippy::arithmetic_side_effects)]
    #[expect(clippy::cast_possible_truncation)]
    {
        msg.hdr.nlmsg_len += offset as u32;
    }

    // SAFETY: Cast to byte slice for send.
    let buf = unsafe {
        std::slice::from_raw_parts(
            std::ptr::addr_of!(msg) as *const u8,
            msg.hdr.nlmsg_len as usize,
        )
    };
    sock.send(buf, 0)
        .map_err(|e| Errno::from_raw(e.raw_os_error().unwrap_or(libc::ENOSYS)))?;

    // Receive response.
    let (buf, _) = sock
        .recv_from_full()
        .map_err(|e| Errno::from_raw(e.raw_os_error().unwrap_or(libc::ENOSYS)))?;

    // Check response messages for error.
    let mut offset = 0;
    #[expect(clippy::arithmetic_side_effects)]
    while offset + size_of::<libc::nlmsghdr>() <= buf.len() {
        // SAFETY: Unaligned read of netlink message header from validated bounds.
        let hdr: libc::nlmsghdr =
            unsafe { std::ptr::read_unaligned(buf.as_ptr().add(offset) as *const libc::nlmsghdr) };

        let len = hdr.nlmsg_len as usize;
        if len < size_of::<libc::nlmsghdr>() || offset + len > buf.len() {
            return Err(Errno::EINVAL);
        }

        // Check for error message.
        #[expect(clippy::cast_possible_truncation)]
        if hdr.nlmsg_type == libc::NLMSG_ERROR as libc::c_ushort
            && len >= size_of::<libc::nlmsghdr>() + size_of::<libc::nlmsgerr>()
        {
            // SAFETY: Unaligned read; enough data to safely parse nlmsgerr.
            let err: libc::nlmsgerr = unsafe {
                std::ptr::read_unaligned(
                    buf.as_ptr().add(offset + size_of::<libc::nlmsghdr>()) as *const libc::nlmsgerr
                )
            };
            if err.error != 0 {
                return Err(Errno::from_raw(-err.error));
            }
        }

        #[expect(clippy::cast_possible_truncation)]
        #[expect(clippy::cast_possible_wrap)]
        #[expect(clippy::cast_sign_loss)]
        {
            // SAFETY: nlmsg_len is kernel-aligned; advance to next message.
            offset += unsafe { libc::NLA_ALIGN(len as i32) as usize };
        }
    }

    Ok(())
}
