//
// Syd: rock-solid application kernel
// src/workers/emu.rs: `syd_emu' emulator threads
//
// Copyright (c) 2024, 2025, 2026 Ali Polatel <alip@chesswob.org>
// Based in part upon rusty_pool which is:
//     Copyright (c) Robin Friedli <robinfriedli@icloud.com>
//     SPDX-License-Identifier: Apache-2.0
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    mem::MaybeUninit,
    option::Option,
    os::fd::RawFd,
    sync::{
        atomic::{AtomicBool, Ordering},
        Arc, RwLock,
    },
    thread,
};

#[expect(deprecated)]
use libc::SOCK_PACKET;
use libc::{AF_ALG, AF_INET, AF_INET6, AF_NETLINK, AF_UNIX, EACCES, SOCK_RAW};
use libseccomp::{scmp_cmp, RawSyscall, ScmpAction, ScmpArch, ScmpFilterContext, ScmpSyscall};
use libseccomp_sys::{const_scmp_filter_ctx, seccomp_load, seccomp_notify_receive};
use nix::{
    errno::Errno,
    fcntl::OFlag,
    sched::{unshare, CloneFlags},
    unistd::{Gid, Pid, Uid},
};

use crate::{
    alert,
    compat::{seccomp_notif, seccomp_notif_resp},
    config::*,
    confine::{
        confine_scmp, confine_scmp_clone, confine_scmp_clone3, confine_scmp_execveat,
        confine_scmp_faccessat2, confine_scmp_fallocate, confine_scmp_fchdir, confine_scmp_fchmod,
        confine_scmp_fchmodat, confine_scmp_fchmodat2, confine_scmp_fchown, confine_scmp_fchownat,
        confine_scmp_fcntl, confine_scmp_fremovexattr, confine_scmp_fsetxattr,
        confine_scmp_ftruncate, confine_scmp_ioctl_syd, confine_scmp_linkat,
        confine_scmp_lremovexattr, confine_scmp_lsetxattr, confine_scmp_madvise,
        confine_scmp_memfd_create, confine_scmp_memfd_secret, confine_scmp_mkdirat,
        confine_scmp_mknodat, confine_scmp_open, confine_scmp_openat, confine_scmp_openat2,
        confine_scmp_pidfd_getfd, confine_scmp_pidfd_open, confine_scmp_pidfd_send_signal,
        confine_scmp_pipe2, confine_scmp_prctl, confine_scmp_removexattrat, confine_scmp_renameat2,
        confine_scmp_setxattrat, confine_scmp_sigaction, confine_scmp_symlinkat,
        confine_scmp_truncate, confine_scmp_umask, confine_scmp_uname, confine_scmp_unlinkat,
        confine_scmp_unshare, confine_scmp_utimensat, scmp_add_setid_rules, scmp_arch_raw,
        ScmpNotifReq, SydArch, Sydcall, EIDRM, EOWNERDEAD, X32_SYSCALL_BIT,
    },
    cookie::{
        CookieIdx, SYSCOOKIE_POOL, SYS_ACCEPT4, SYS_BIND, SYS_CONNECT, SYS_SOCKET, SYS_SOCKETPAIR,
    },
    err::{err2no, SydJoinHandle, SydResult},
    fd::close,
    fs::seccomp_notify_respond,
    hook::HandlerMap,
    info,
    proc::proc_get_vma,
    req::UNotifyEventRequest,
    sandbox::{Options, Sandbox, SandboxGuard},
    timer::AlarmTimer,
    workers::{WorkerCache, WorkerData},
};

/// State carried by a single `syd_emu` emulator thread.
///
/// The struct is `Clone`; the shared parts are `Arc` handles, so a clone
/// refers to the same cache, sandbox, handler map, exit flag and worker
/// counters as the value it was cloned from.
#[derive(Clone)]
pub(crate) struct Worker {
    /// Descriptor passed to `seccomp_notify_receive` and
    /// `seccomp_notify_respond`; closed by the thread that enters ghost mode.
    fd: RawFd,
    /// Copy of the sandbox options taken under a read lock in `Worker::new`.
    options: Options,
    /// Shared worker cache; its `crypt_map` being set disables `CLONE_FILES`
    /// unsharing for the thread.
    cache: Arc<WorkerCache>,
    /// Shared sandbox, handed to every `UNotifyEventRequest`.
    sandbox: Arc<RwLock<Sandbox>>,
    /// Lookup table from `Sydcall` to the handler invoked for a request.
    handlers: Arc<HandlerMap>,
    /// When `Some`, number of seconds given to `AlarmTimer::from_seconds`;
    /// an `EINTR` while the timer exists makes the thread exit.
    keep_alive: Option<u16>,
    /// Checked after each loop iteration; set to `true` on ghost mode.
    should_exit: Arc<AtomicBool>,
    /// Shared worker counters (total and busy), updated by the work loop
    /// and by `Sentinel` on panic.
    worker_data: Arc<WorkerData>,
}

impl Worker {
    /// Build a worker for the seccomp notify descriptor `fd`.
    ///
    /// The sandbox options are copied once under a short-lived read lock
    /// and stored in the worker; a poisoned lock is read regardless.
    /// All other arguments are stored as given.
    pub(crate) fn new(
        fd: RawFd,
        cache: Arc<WorkerCache>,
        sandbox: Arc<RwLock<Sandbox>>,
        handlers: Arc<HandlerMap>,
        keep_alive: Option<u16>,
        should_exit: Arc<AtomicBool>,
        worker_data: Arc<WorkerData>,
    ) -> Self {
        // Take the snapshot inside its own scope so that the read lock is
        // released before the sandbox handle is moved into the worker.
        let options = {
            let lock = sandbox.read().unwrap_or_else(|err| err.into_inner());
            let guard = SandboxGuard::Read(lock);
            *guard.options
        };

        Self {
            fd,
            options,
            cache,
            sandbox,
            handlers,
            keep_alive,
            should_exit,
            worker_data,
        }
    }

    /// Spawn a `syd_emu` thread that runs this worker's notification loop.
    ///
    /// `ctx` is an optional seccomp filter. When given, it is loaded into
    /// the emulator thread the first time the loop observes
    /// `Sandbox::is_locked_once()`, right after `EMU_LOCK_SYSCALLS` have
    /// been confined with `ScmpAction::KillProcess`.
    ///
    /// Returns the join handle, or the `Errno` mapped from the spawn error.
    /// Failing to unshare, to create the keep-alive timer or to confine the
    /// thread terminates the whole process with exit code 101.
    #[expect(clippy::cognitive_complexity)]
    pub(crate) fn try_spawn(
        self,
        ctx: Option<&ScmpFilterContext>,
    ) -> Result<SydJoinHandle<()>, Errno> {
        // SAFETY: ScmpFilterContext is not Send, so it cannot cross the
        // thread boundary. The raw pointer is passed as an integer instead;
        // the context is owned by the monitor thread and is guaranteed to
        // stay valid throughout Syd's lifetime.
        let mut ctx = ctx.map(|filter| filter.as_ptr() as usize);

        let builder = thread::Builder::new()
            .name("syd_emu".to_string())
            .stack_size(EMU_STACK_SIZE);

        builder
            .spawn(move || {
                // Per-thread state obtained through unshare(2):
                // 1. CLONE_FS: cwd and umask.
                // 2. CLONE_FILES: file descriptor table.
                // 3. CLONE_SYSVSEM: System V semaphores.
                //
                // CLONE_FILES is left out when:
                // 1. Crypt sandboxing is on, because emulator threads
                //    share memory fds with AES threads.
                // 2. KCOV is on, because emulator threads share memory
                //    fds with the main thread.
                let is_crypt = self.cache.crypt_map.is_some();
                let unshare_flags = if cfg!(feature = "kcov") || is_crypt {
                    CloneFlags::CLONE_FS | CloneFlags::CLONE_SYSVSEM
                } else {
                    CloneFlags::CLONE_FS | CloneFlags::CLONE_SYSVSEM | CloneFlags::CLONE_FILES
                };

                // SAFETY: Unsharing is a critical safety feature,
                // so failure takes the whole process down.
                if let Err(errno) = unshare(unshare_flags) {
                    alert!("ctx": "boot", "op": "unshare_emu_thread",
                        "msg": format!("failed to unshare({unshare_flags:?}): {errno}"),
                        "err": errno as i32);
                    std::process::exit(101);
                }

                // SAFETY: Expiring idle threads is a critical safety feature,
                // so failure to set up the timer takes the whole process down.
                let mut timer = match self.keep_alive {
                    None => None,
                    Some(keep_alive) => match AlarmTimer::from_seconds(keep_alive.into()) {
                        Ok(timer) => Some(timer),
                        Err(errno) => {
                            alert!("ctx": "boot", "op": "timer_create_for_emu_thread",
                                "msg": format!("failed to set up timer: {errno}"),
                                "err": errno as i32);
                            std::process::exit(101);
                        }
                    },
                };

                // The sentinel fixes up the counters (and denies the
                // in-flight syscall) should this thread panic.
                let mut sentinel = Sentinel::new(&self);

                // The thread is up: account for it.
                self.worker_data.increment_worker_total();

                loop {
                    // Confine the thread once the sandbox is locked.
                    if let Some(filter) = ctx {
                        if Sandbox::is_locked_once() {
                            // Deny critical system calls on sandbox lock.
                            //
                            // SAFETY: Confinement is a critical safety feature,
                            // so failure takes the whole process down.
                            if let Err(error) =
                                confine_scmp(ScmpAction::KillProcess, EMU_LOCK_SYSCALLS)
                            {
                                let errno = error.errno().unwrap_or(Errno::ENOSYS);
                                alert!("ctx": "boot", "op": "confine_emu_thread",
                                    "msg": format!("failed to confine: {error}"),
                                    "err": errno as i32);
                                std::process::exit(101);
                            }

                            let safe_setid = self.options.intersects(
                                Options::OPT_ALLOW_SAFE_SETUID | Options::OPT_ALLOW_SAFE_SETGID,
                            );
                            info!("ctx": "confine", "op": "confine_emu_thread",
                                "msg": format!("emulator thread confined with{} SROP mitigation",
                                    if safe_setid { "out" } else { "" }));

                            // SAFETY: The filter pointer is owned by the monitor
                            // thread and valid for the lifetime of the Syd process.
                            let error = unsafe { seccomp_load(filter as const_scmp_filter_ctx) };

                            // Forget the pointer so the filter is loaded only once;
                            // the context itself is leaked intentionally.
                            ctx = None;

                            // SAFETY: Confinement is a critical safety feature,
                            // so failure takes the whole process down.
                            if error != 0 {
                                let errno = Errno::from_raw(error.abs());
                                alert!("ctx": "boot", "op": "confine_emu_thread",
                                    "msg": format!("failed to confine: {error}"),
                                    "err": errno as i32);
                                std::process::exit(101);
                            }
                        }
                    }

                    // Wait for the next seccomp notification.
                    let request = match self.receive(&mut timer) {
                        Ok(request) => request,
                        Err(_) => {
                            // Critical error: drop out of the worker count and exit.
                            self.worker_data.decrement_worker_total();
                            break;
                        }
                    };

                    // `None` means the process died mid-way; nothing to handle.
                    if let Some(request) = request {
                        // Busy: remember the request so a panic can deny it.
                        sentinel.seccomp_id = Some(request.id);
                        self.worker_data.increment_worker_busy();

                        self.handle(request);

                        // Idle again.
                        sentinel.seccomp_id = None;
                        self.worker_data.decrement_worker_busy();
                    }

                    // Leave the loop once an exit has been requested.
                    if self.should_exit.load(Ordering::Relaxed) {
                        break;
                    }
                }

                Ok(())
            })
            .map_err(|err| err2no(&err))
    }

    /// Receive the next notification and classify failures.
    ///
    /// * `Ok(Some(req))`: a request was read.
    /// * `Ok(None)`: a failure that is ignored because it cannot be
    ///   handled here, e.g. EINTR|ENOENT when the task is killed mid-way.
    /// * `Err(EBADF)`: the file descriptor was closed.
    /// * `Err(EINTR)`: the read was interrupted while a timer is in use.
    fn receive(&self, timer: &mut Option<AlarmTimer>) -> Result<Option<ScmpNotifReq>, Errno> {
        let errno = match self.read(timer) {
            Ok(request) => return Ok(Some(request)),
            Err(errno) => errno,
        };

        match errno {
            Errno::EBADF => Err(Errno::EBADF),
            Errno::EINTR if timer.is_some() => Err(Errno::EINTR),
            _ => Ok(None),
        }
    }

    /// Read one seccomp notification from the notify descriptor.
    ///
    /// When a timer is supplied it is started before the receive call and
    /// stopped right after it, whatever the outcome of the call.
    ///
    /// # Errors
    ///
    /// Returns the error of the receive call or of the conversion into
    /// `ScmpNotifReq`, and `EINTR` when the received request has a zero
    /// `id` or a zero `pid`.
    ///
    /// # Panics
    ///
    /// Panics if the timer cannot be started or stopped.
    fn read(&self, timer: &mut Option<AlarmTimer>) -> Result<ScmpNotifReq, Errno> {
        // The crate's own seccomp_notif layout is used instead of
        // libseccomp_sys's opaque type, which would force a heap
        // allocation that is not always wanted.
        let mut notif = MaybeUninit::<seccomp_notif>::zeroed();

        if let Some(alarm) = timer.as_mut() {
            // SAFETY: Something is awfully wrong if the timer
            // cannot be set, so panic to make that visible.
            #[expect(clippy::disallowed_methods)]
            alarm.start().expect("timer_settime");
        }

        // SAFETY: libseccomp's wrapper allocates each call.
        // Note: EINTR may also mean child killed by signal!
        let rc = unsafe { seccomp_notify_receive(self.fd, notif.as_mut_ptr().cast()) };
        // Turn the return value into a Result before touching the timer.
        let received = Errno::result(rc);

        if let Some(alarm) = timer.as_mut() {
            // SAFETY: Same reasoning as for starting the timer.
            #[expect(clippy::disallowed_methods)]
            alarm.stop().expect("timer_settime");
        }
        received?;

        // SAFETY: seccomp_notify_receive returned success,
        // so the request is populated and safe to access.
        let req = ScmpNotifReq::from_sys(unsafe { notif.assume_init() })?;

        if req.id == 0 || req.pid == 0 {
            // interrupted/task killed mid-way.
            return Err(Errno::EINTR);
        }

        Ok(req)
    }

    /// Dispatch one seccomp notification to its handler and send the reply.
    ///
    /// Two pseudo errnos in the handler's response are intercepted when the
    /// response's `id`, `val` and `flags` are all zero:
    ///
    /// * `EIDRM`: nothing is sent and the function returns.
    /// * `EOWNERDEAD`: ghost mode. A success reply is sent for the request,
    ///   the notify descriptor is closed, the exit flag is raised and the
    ///   crypt condition variable, if any, is notified.
    ///
    /// # Panics
    ///
    /// Panics if no handler is registered for the request's system call.
    #[expect(clippy::cognitive_complexity)]
    fn handle(&self, mut req: ScmpNotifReq) {
        // An x86-64 request whose syscall number carries the x32 bit
        // is relabelled with the x32 architecture.
        let is_x32 = req.data.arch == ScmpArch::X8664
            && req.data.syscall.as_raw_syscall() & X32_SYSCALL_BIT != 0;
        if is_x32 {
            req.data.arch = ScmpArch::X32;
        }

        // Find the handler for this system call; a miss is a bug.
        let syscall = Sydcall(req.data.syscall, scmp_arch_raw(req.data.arch));
        let handler = match self.handlers.get(&syscall) {
            Some(handler) => handler,
            None => unreachable!("BUG: Missing hook for request {req:?}!"),
        };

        let mut reply = handler(UNotifyEventRequest::new(
            req,
            syscall,
            self.fd,
            Arc::clone(&self.cache),
            Arc::clone(&self.sandbox),
        ));

        // Pseudo errnos are only honoured on an otherwise all-zero reply:
        // 1. EIDRM:
        //    a. A previous SECCOMP_IOCTL_NOTIF_ADDFD has already replied to the
        //       request with SECCOMP_ADDFD_FLAG_SEND, so no second reply is due.
        //    b. A read-write encryption request was made and the encrypted fd has
        //       already been returned with SECCOMP_IOCTL_NOTIF_ADDFD and the flag
        //       SECCOMP_ADDFD_FLAG_SEND.
        // 2. EOWNERDEAD: Enter ghost mode.
        let is_pseudo = reply.id == 0 && reply.val == 0 && reply.flags == 0;
        let ghost = match reply.error {
            EIDRM if is_pseudo => return,
            EOWNERDEAD if is_pseudo => {
                #[expect(clippy::cast_possible_wrap)]
                let pid = Pid::from_raw(req.pid as libc::pid_t);
                let vma = proc_get_vma(pid, req.data.instr_pointer).ok();
                crate::warn!("ctx": "confine", "op": "enter_ghost_mode", "pid": req.pid,
                    "sys": syscall, "arch": SydArch::from(req.data.arch), "args": req.data.args,
                    "ip": req.data.instr_pointer, "src": vma);

                // Restore the real notification ID and turn the
                // pseudo errno(3) EOWNERDEAD back into success.
                reply.id = req.id;
                reply.error = 0;
                reply.val = 0;

                true
            }
            _ => false,
        };

        let resp = seccomp_notif_resp {
            id: reply.id,
            val: reply.val,
            error: reply.error,
            flags: reply.flags,
        };

        // The outcome is ignored and no retry is attempted here:
        // EINTR may mean the child is signaled, ENOENT means the child
        // died mid-way, and nothing else can be done about other errors.
        let _ = seccomp_notify_respond(self.fd, std::ptr::addr_of!(resp));

        // Finish entering ghost mode as necessary.
        if ghost {
            let _ = close(self.fd);

            // Tell the monitor thread and the other emulator threads to
            // exit, and wake the syd_aes thread as necessary.
            self.should_exit.store(true, Ordering::Relaxed);
            if let Some(crypt_map) = self.cache.crypt_map.as_ref() {
                let (aes_map, cvar) = &**crypt_map;
                let _guard = aes_map.lock().unwrap_or_else(|e| e.into_inner());
                cvar.notify_one();
            } // Lock is released here.
        }
    }

    /// Build the seccomp filter used to confine emulator threads.
    ///
    /// The filter is only constructed here, not loaded: the finished
    /// context is returned to the caller. Both its default action and its
    /// bad-architecture action are `ScmpAction::KillProcess`.
    ///
    /// # Arguments
    ///
    /// * `seccomp_fd` - descriptor forwarded to `confine_scmp_ioctl_syd`.
    /// * `options` - sandbox options selecting the optional relaxations
    ///   (syscall argument cookies, socket restrictions, SafeSetID and the
    ///   Speculative Store Bypass control).
    /// * `is_crypt` - whether Crypt sandboxing is on; this adds the
    ///   `pipe2` rule and removes `CLONE_FILES` from the `unshare` flags
    ///   the filter allows.
    /// * `transit_uids`, `transit_gids` - forwarded to
    ///   `scmp_add_setid_rules` when SafeSetID is enabled.
    ///
    /// # Errors
    ///
    /// Propagates any error from creating the filter, setting its
    /// attributes (except the optimization level, which is best effort)
    /// or inserting rules. System calls whose name cannot be resolved are
    /// logged with `info!` and skipped.
    #[expect(clippy::cognitive_complexity)]
    pub(crate) fn prepare_confine(
        seccomp_fd: RawFd,
        options: Options,
        is_crypt: bool,
        transit_uids: &[(Uid, Uid)],
        transit_gids: &[(Gid, Gid)],
    ) -> SydResult<ScmpFilterContext> {
        // Create seccomp filter with default action.
        let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;

        // Enforce the NO_NEW_PRIVS functionality before
        // loading the seccomp filter into the kernel.
        ctx.set_ctl_nnp(true)?;

        // Disable Speculative Store Bypass mitigations
        // with trace/allow_unsafe_exec_speculative:1
        ctx.set_ctl_ssb(options.allow_unsafe_exec_speculative())?;

        // DO NOT synchronize filter to all threads.
        // Main thread will confine itself.
        ctx.set_ctl_tsync(false)?;

        // We kill for bad system call and bad arch.
        ctx.set_act_badarch(ScmpAction::KillProcess)?;

        // Use a binary tree sorted by syscall number if possible.
        // Best effort: a failure to set the optimization level is ignored.
        let _ = ctx.set_ctl_optimize(2);

        // SAFETY: Do NOT add supported architectures to the filter.
        // This ensures Syd can never run a non-native system call,
        // which we do not need at all.
        // seccomp_add_architectures(&mut ctx)?;

        // Allow clones without namespace flags.
        confine_scmp_clone(&mut ctx)?;
        // Deny clone3 with ENOSYS for compatibility.
        confine_scmp_clone3(&mut ctx)?;

        // Allow safe madvise(2) advice.
        confine_scmp_madvise(&mut ctx)?;

        // SafeSetId is used when changing UID/GID.
        let safe_setid =
            options.intersects(Options::OPT_ALLOW_SAFE_SETUID | Options::OPT_ALLOW_SAFE_SETGID);

        // Syscall argument cookies may be disabled
        // at startup with trace/allow_unsafe_nocookie:1.
        let restrict_cookie = !options.allow_unsafe_nocookie();

        // Allow safe system calls.
        //
        // KCOV_SYSCALLS is empty in case `kcov` feature is disabled.
        // PROF_SYSCALLS is empty in case `prof` feature is disabled.
        for sysname in EMU_SYSCALLS
            .iter()
            .chain(FUTEX_SYSCALLS)
            .chain(GETID_SYSCALLS)
            .chain(KCOV_SYSCALLS)
            .chain(PROF_SYSCALLS)
            .chain(VDSO_SYSCALLS)
        {
            match ScmpSyscall::from_name(sysname) {
                Ok(syscall) => {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": format!("invalid or unsupported syscall {sysname}"));
                }
            }
        }

        // prctl(2) operations: the base set, extended with the
        // SafeSetID set only when SafeSetID is enabled.
        let prctl_ops = EMU_PRCTL_OPS.iter().chain(if safe_setid {
            EMU_PRCTL_OPS_SAFESETID.iter()
        } else {
            [].iter()
        });
        // Per-syscall confinement helpers; those taking `restrict_cookie`
        // are told whether syscall argument cookies are to be enforced.
        confine_scmp_execveat(&mut ctx, restrict_cookie)?;
        confine_scmp_faccessat2(&mut ctx, restrict_cookie)?;
        confine_scmp_fallocate(&mut ctx, restrict_cookie)?;
        confine_scmp_fchdir(&mut ctx, restrict_cookie)?;
        confine_scmp_fchmod(&mut ctx, restrict_cookie)?;
        confine_scmp_fchmodat(&mut ctx, restrict_cookie)?;
        confine_scmp_fchmodat2(&mut ctx, restrict_cookie)?;
        confine_scmp_fchown(&mut ctx, restrict_cookie)?;
        confine_scmp_fchownat(&mut ctx, restrict_cookie)?;
        confine_scmp_fremovexattr(&mut ctx, restrict_cookie)?;
        confine_scmp_fsetxattr(&mut ctx, restrict_cookie)?;
        confine_scmp_ftruncate(&mut ctx, restrict_cookie)?;
        confine_scmp_ioctl_syd(&mut ctx, restrict_cookie, Some(seccomp_fd))?;
        confine_scmp_linkat(&mut ctx, restrict_cookie)?;
        confine_scmp_lremovexattr(&mut ctx, restrict_cookie)?;
        confine_scmp_lsetxattr(&mut ctx, restrict_cookie)?;
        confine_scmp_memfd_create(&mut ctx, restrict_cookie)?;
        confine_scmp_memfd_secret(&mut ctx, restrict_cookie)?;
        confine_scmp_mkdirat(&mut ctx, restrict_cookie)?;
        confine_scmp_mknodat(&mut ctx, restrict_cookie)?;
        confine_scmp_open(&mut ctx)?;
        confine_scmp_openat(&mut ctx)?;
        confine_scmp_openat2(&mut ctx, restrict_cookie)?;
        confine_scmp_pidfd_getfd(&mut ctx, restrict_cookie)?;
        confine_scmp_pidfd_open(&mut ctx, restrict_cookie)?;
        confine_scmp_pidfd_send_signal(&mut ctx, restrict_cookie)?;
        confine_scmp_prctl(&mut ctx, prctl_ops)?;
        confine_scmp_removexattrat(&mut ctx, restrict_cookie)?;
        confine_scmp_renameat2(&mut ctx, restrict_cookie)?;
        confine_scmp_setxattrat(&mut ctx)?;
        confine_scmp_sigaction(&mut ctx)?;
        confine_scmp_symlinkat(&mut ctx, restrict_cookie)?;
        confine_scmp_truncate(&mut ctx, restrict_cookie)?;
        confine_scmp_umask(&mut ctx, restrict_cookie)?;
        confine_scmp_uname(&mut ctx, restrict_cookie)?;
        confine_scmp_unlinkat(&mut ctx, restrict_cookie)?;
        confine_scmp_utimensat(&mut ctx)?;

        // KCOV needs unrestricted fcntl(2) calls.
        // TODO: Find out what fcntls it actually needs.
        if cfg!(feature = "kcov") {
            for sysname in ["fcntl", "fcntl64"] {
                if let Ok(syscall) = ScmpSyscall::from_name(sysname) {
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
            }
        } else {
            confine_scmp_fcntl(&mut ctx, EMU_FCNTL_OPS)?;
        }

        // Restrict pipe2(2) flags, and use syscall argument cookies.
        // Needed for Crypt sandboxing and KCOV.
        if is_crypt || cfg!(feature = "kcov") {
            confine_scmp_pipe2(&mut ctx, restrict_cookie, OFlag::O_CLOEXEC)?;
        }

        // Allow unshare(2) with CLONE_FS|CLONE_FILES|CLONE_SYSVSEM only.
        // Crypt sandboxing and KCOV require FD-share between EMU<->AES threads.
        // These flags mirror the ones the emulator thread passes to
        // unshare(2) at startup in `try_spawn`.
        let mut clone_flags = CloneFlags::CLONE_FS | CloneFlags::CLONE_SYSVSEM;
        if !cfg!(feature = "kcov") && !is_crypt {
            clone_flags.insert(CloneFlags::CLONE_FILES);
        };
        confine_scmp_unshare(&mut ctx, clone_flags)?;

        // socket(2) may be used only with syscall argument cookies.
        //
        // We only enforce this on architectures where the system call is direct,
        // and there's no socketcall(2) multiplexer indirection.
        #[expect(clippy::cast_possible_truncation)]
        #[expect(clippy::cast_sign_loss)]
        #[expect(clippy::useless_conversion)]
        #[expect(deprecated)]
        if let Some(syscall) = SYS_SOCKET.map(|n| ScmpSyscall::from_raw_syscall(n as RawSyscall)) {
            // Deny based on socket type.
            // The comparison is an exact match on the type argument.
            if !options.allow_unsafe_socket() {
                for ty in [SOCK_RAW as u64, SOCK_PACKET as u64] {
                    ctx.add_rule_conditional(
                        ScmpAction::Errno(EACCES),
                        syscall,
                        &[scmp_cmp!($arg1 == ty)],
                    )?;
                }
            }

            // Allow only specified socket domains.
            let domains = if !options.allow_unsupp_socket() {
                let mut domains = vec![AF_UNIX, AF_INET, AF_INET6, AF_NETLINK];
                if options.allow_safe_kcapi() {
                    domains.push(AF_ALG);
                }
                Some(domains)
            } else {
                None
            };

            // Secure using syscall argument cookies.
            // socket(2) takes three arguments; the unused ones carry the cookies.
            let mut rules = vec![];
            if restrict_cookie {
                rules.extend(&[
                    scmp_cmp!($arg3 == SYSCOOKIE_POOL.get(CookieIdx::SocketArg3).into()),
                    scmp_cmp!($arg4 == SYSCOOKIE_POOL.get(CookieIdx::SocketArg4).into()),
                    scmp_cmp!($arg5 == SYSCOOKIE_POOL.get(CookieIdx::SocketArg5).into()),
                ]);
            }

            if let Some(domains) = domains {
                // One allow rule per domain: the domain comparison is
                // pushed on top of the cookie comparisons and popped again.
                for domain in domains {
                    rules.push(scmp_cmp!($arg0 == domain as u64));
                    ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
                    rules.pop();
                }
            } else if rules.is_empty() {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
            }
        } else {
            // No direct syscall number is available: allow by name,
            // without argument checks.
            match ScmpSyscall::from_name("socket") {
                Ok(syscall) => {
                    // Allow socketcall(2).
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": "invalid or unsupported syscall socket");
                }
            }
        }

        // socketpair(2) may be used only with syscall argument cookies.
        //
        // We only enforce this on architectures where the system call is direct,
        // and there's no socketcall(2) multiplexer indirection.
        #[expect(clippy::cast_possible_truncation)]
        #[expect(clippy::cast_sign_loss)]
        #[expect(clippy::useless_conversion)]
        #[expect(deprecated)]
        if let Some(syscall) =
            SYS_SOCKETPAIR.map(|n| ScmpSyscall::from_raw_syscall(n as RawSyscall))
        {
            // Deny based on socket type.
            if !options.allow_unsafe_socket() {
                for ty in [SOCK_RAW as u64, SOCK_PACKET as u64] {
                    ctx.add_rule_conditional(
                        ScmpAction::Errno(EACCES),
                        syscall,
                        &[scmp_cmp!($arg1 == ty)],
                    )?;
                }
            }

            // Allow only specified socket domains.
            let domains = if !options.allow_unsupp_socket() {
                let mut domains = vec![AF_UNIX, AF_INET, AF_INET6, AF_NETLINK];
                if options.allow_safe_kcapi() {
                    domains.push(AF_ALG);
                }
                Some(domains)
            } else {
                None
            };

            // Secure using syscall argument cookies.
            // socketpair(2) takes four arguments; the unused ones carry the cookies.
            let mut rules = vec![];
            if restrict_cookie {
                rules.extend(&[
                    scmp_cmp!($arg4 == SYSCOOKIE_POOL.get(CookieIdx::SocketpairArg4).into()),
                    scmp_cmp!($arg5 == SYSCOOKIE_POOL.get(CookieIdx::SocketpairArg5).into()),
                ]);
            }

            if let Some(domains) = domains {
                for domain in domains {
                    rules.push(scmp_cmp!($arg0 == domain as u64));
                    ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
                    rules.pop();
                }
            } else if rules.is_empty() {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
            }
        } else {
            match ScmpSyscall::from_name("socketpair") {
                Ok(syscall) => {
                    // Allow socketcall(2).
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": "invalid or unsupported syscall socketpair");
                }
            }
        }

        // accept4(2) may be used only with syscall argument cookies.
        //
        // We only enforce this on architectures where the system call is direct,
        // and there's no socketcall(2) multiplexer indirection.
        #[expect(clippy::cast_possible_truncation)]
        #[expect(clippy::useless_conversion)]
        if let Some(syscall) = SYS_ACCEPT4.map(|n| ScmpSyscall::from_raw_syscall(n as RawSyscall)) {
            // Secure using syscall argument cookies.
            let mut rules = vec![];
            if restrict_cookie {
                rules.extend(&[
                    scmp_cmp!($arg4 == SYSCOOKIE_POOL.get(CookieIdx::Accept4Arg4).into()),
                    scmp_cmp!($arg5 == SYSCOOKIE_POOL.get(CookieIdx::Accept4Arg5).into()),
                ]);
            }

            if rules.is_empty() {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
            }
        } else {
            match ScmpSyscall::from_name("accept4") {
                Ok(syscall) => {
                    // Allow socketcall(2).
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": "invalid or unsupported syscall accept4");
                }
            }
        }

        // bind(2) may be used only with syscall argument cookies.
        //
        // We only enforce this on architectures where the system call is direct,
        // and there's no socketcall(2) multiplexer indirection.
        #[expect(clippy::cast_possible_truncation)]
        #[expect(clippy::useless_conversion)]
        if let Some(syscall) = SYS_BIND.map(|n| ScmpSyscall::from_raw_syscall(n as RawSyscall)) {
            // Secure using syscall argument cookies.
            let mut rules = vec![];
            if restrict_cookie {
                rules.extend(&[
                    scmp_cmp!($arg3 == SYSCOOKIE_POOL.get(CookieIdx::BindArg3).into()),
                    scmp_cmp!($arg4 == SYSCOOKIE_POOL.get(CookieIdx::BindArg4).into()),
                    scmp_cmp!($arg5 == SYSCOOKIE_POOL.get(CookieIdx::BindArg5).into()),
                ]);
            }

            if rules.is_empty() {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
            }
        } else {
            match ScmpSyscall::from_name("bind") {
                Ok(syscall) => {
                    // Allow socketcall(2).
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": "invalid or unsupported syscall bind");
                }
            }
        }

        // connect(2) may be used only with syscall argument cookies.
        //
        // We only enforce this on architectures where the system call is direct,
        // and there's no socketcall(2) multiplexer indirection.
        #[expect(clippy::cast_possible_truncation)]
        #[expect(clippy::useless_conversion)]
        if let Some(syscall) = SYS_CONNECT.map(|n| ScmpSyscall::from_raw_syscall(n as RawSyscall)) {
            // Secure using syscall argument cookies.
            let mut rules = vec![];
            if restrict_cookie {
                rules.extend(&[
                    scmp_cmp!($arg3 == SYSCOOKIE_POOL.get(CookieIdx::ConnectArg3).into()),
                    scmp_cmp!($arg4 == SYSCOOKIE_POOL.get(CookieIdx::ConnectArg4).into()),
                    scmp_cmp!($arg5 == SYSCOOKIE_POOL.get(CookieIdx::ConnectArg5).into()),
                ]);
            }

            if rules.is_empty() {
                ctx.add_rule(ScmpAction::Allow, syscall)?;
            } else {
                ctx.add_rule_conditional(ScmpAction::Allow, syscall, &rules)?;
            }
        } else {
            match ScmpSyscall::from_name("connect") {
                Ok(syscall) => {
                    // Allow socketcall(2).
                    ctx.add_rule(ScmpAction::Allow, syscall)?;
                }
                Err(_) => {
                    info!("ctx": "confine", "op": "allow_emu_syscall",
                        "msg": "invalid or unsupported syscall connect");
                }
            }
        }

        // Allow UID/GID changing system calls as necessary.
        let safe_setuid = options.allow_safe_setuid();
        let safe_setgid = options.allow_safe_setgid();
        if safe_setuid || safe_setgid {
            scmp_add_setid_rules(
                "emu",
                &mut ctx,
                safe_setuid,
                safe_setgid,
                transit_uids,
                transit_gids,
            )?;

            // SAFETY:
            // 1. cap{g,s}et is necessary to drop CAP_SET{U,G}ID after changing {U,G}ID.
            // 2. Signal system calls are necessary to handle reserved signals.
            // Note, {rt_,}sigreturn is already allowed for emulators to handle SIGALRM.
            for sysname in ["capget", "capset", "sigaction", "rt_sigaction"] {
                match ScmpSyscall::from_name(sysname) {
                    Ok(syscall) => {
                        ctx.add_rule(ScmpAction::Allow, syscall)?;
                    }
                    Err(_) => {
                        info!("ctx": "confine", "op": "allow_emu_syscall",
                            "msg": format!("invalid or unsupported syscall {sysname}"));
                    }
                }
            }
        }

        Ok(ctx)
    }
}

/// Type that exists to manage worker exit on panic.
///
/// A `Sentinel` is created once per emulator thread, before the work
/// loop starts. While a request is being handled, `seccomp_id` holds the
/// ID of that seccomp notification; it is `None` while the thread is
/// idle.
///
/// The `Drop` implementation acts only when the thread is panicking:
///
/// * A busy thread denies the in-flight system call with `EACCES` and
///   updates the worker counters through `decrement_both`.
/// * An idle thread only decrements the total worker count.
///
/// Nothing in this type restarts a worker that panicked.
struct Sentinel<'a> {
    /// ID of the seccomp notification being handled, if any.
    seccomp_id: Option<u64>,
    /// Worker whose descriptor and counters are used on panic.
    worker_ref: &'a Worker,
}

impl<'a> Sentinel<'a> {
    fn new(worker_ref: &'a Worker) -> Sentinel<'a> {
        Self {
            seccomp_id: None,
            worker_ref,
        }
    }

    #[expect(clippy::arithmetic_side_effects)]
    fn deny_syscall(&self, seccomp_id: u64, errno: Errno) {
        let response = seccomp_notif_resp {
            id: seccomp_id,
            val: 0,
            error: -(errno as i32),
            flags: 0,
        };

        // EAGAIN|EINTR is retried.
        // ENOENT means child died mid-way.
        // Nothing else we can do on errors here.
        let _ = seccomp_notify_respond(self.worker_ref.fd, std::ptr::addr_of!(response));
    }
}

impl Drop for Sentinel<'_> {
    /// Fix up the worker bookkeeping when the thread unwinds from a panic.
    ///
    /// Does nothing on a normal (non-panicking) drop.
    fn drop(&mut self) {
        if !thread::panicking() {
            return;
        }

        let counters = &self.worker_ref.worker_data;
        match self.seccomp_id {
            Some(seccomp_id) => {
                // Busy thread panicked.
                // SAFETY: Deny syscall in progress!
                self.deny_syscall(seccomp_id, Errno::EACCES);
                counters.decrement_both();
            }
            None => {
                // Idle thread panicked.
                counters.decrement_worker_total();
            }
        }
    }
}
