//
// Syd: rock-solid application kernel
// src/fs.rs: Filesystem utilities
//
// Copyright (c) 2023, 2024, 2025, 2026 Ali Polatel <alip@chesswob.org>
// Based in part upon uutils coreutils package's src/lib/features/fs.rs which is:
//   (c) Joseph Crail <jbcrail@gmail.com>
//   (c) Jian Zeng <anonymousknight96 AT gmail.com>
// Tests base based in part upon gnulib packages' tests/test-canonicalize.c which is:
//   (c) Free Software Foundation, Inc.
// {chdir,getdir}_long() functions are based in part upon zsh/Src/compat.c which is:
//   (c) 1992-1997 Paul Falstad
//   SPDX-License-Identifier: ZSH
//
// SPDX-License-Identifier: GPL-3.0

//! Set of functions to manage files and symlinks

use std::{
    fs::{metadata, set_permissions, File},
    io::{Read, Seek, SeekFrom, Write},
    os::{
        fd::{AsFd, AsRawFd, FromRawFd, OwnedFd, RawFd},
        unix::fs::PermissionsExt,
    },
    path::Path,
    sync::LazyLock,
};

use libc::{
    c_int, c_long, c_ulong, c_void, clone, off64_t, siginfo_t, syscall, SYS_ioctl, SYS_kcmp,
    SYS_tgkill, CLONE_PIDFD, _IO, _IOR, _IOW, _IOWR,
};
use libseccomp::{ScmpFilterContext, ScmpSyscall};
use nix::{
    errno::Errno,
    fcntl::{OFlag, AT_FDCWD},
    sched::{CloneCb, CloneFlags},
    sys::{
        signal::{SigSet, Signal},
        stat::Mode,
        time::TimeSpec,
    },
    unistd::Pid,
    NixPath,
};

use crate::{
    compat::{
        getdents64, seccomp_notif, seccomp_notif_addfd, seccomp_notif_resp, AddWatchFlags,
        FallocateFlags,
    },
    config::*,
    confine::CLONE_NEWTIME,
    cookie::{CookieIdx, SYSCOOKIE_POOL},
    err::err2no,
    path::{XPath, XPathBuf, PATH_MAX},
    proc::PROCMAP_QUERY,
    retry::retry_on_eintr,
};

/// Check two processes share the same address space.
pub fn is_same_vm(pid1: Pid, pid2: Pid) -> Result<bool, Errno> {
    const KCMP_VM: u64 = 1;
    // SAFETY: There's no libc wrapper for kcmp.
    Ok(Errno::result(unsafe { syscall(SYS_kcmp, pid1.as_raw(), pid2.as_raw(), KCMP_VM) })? == 0)
}

/// Safe wrapper for inotify_add_watch.
///
/// Adds a watch for `path` with event `mask` to the inotify instance
/// referred to by `fd`, and returns the raw watch descriptor.
pub fn inotify_add_watch<Fd: AsFd, P: ?Sized + NixPath>(
    fd: Fd,
    path: &P,
    mask: AddWatchFlags,
) -> Result<c_int, Errno> {
    let inotify_fd = fd.as_fd().as_raw_fd();

    // SAFETY: We need this because nix' `WatchDescriptor` is opaque...
    let wd = path.with_nix_path(|name| unsafe {
        libc::inotify_add_watch(inotify_fd, name.as_ptr(), mask.bits())
    })?;

    // The raw return value is already a `c_int`; only map -1 to errno.
    Errno::result(wd)
}

/// Safe wrapper for fallocate64.
///
/// Applies the operation selected by `mode` to the byte range starting
/// at `off` and spanning `len` bytes of the file referred to by `fd`.
pub fn fallocate64<Fd: AsFd>(
    fd: Fd,
    mode: FallocateFlags,
    off: off64_t,
    len: off64_t,
) -> Result<(), Errno> {
    let raw_fd = fd.as_fd().as_raw_fd();

    // SAFETY: nix does not have an interface for fallocate64 yet.
    let rc = unsafe { libc::fallocate64(raw_fd, mode.bits(), off, len) };

    Errno::result(rc).map(drop)
}

/// Safe truncate64() wrapper.
///
/// Sets the size of the file at `path` to `len` bytes.
pub fn truncate64<P: ?Sized + NixPath>(path: &P, len: off64_t) -> Result<(), Errno> {
    // SAFETY: nix does not have a truncate64 wrapper.
    let rc = path.with_nix_path(|name| unsafe { libc::truncate64(name.as_ptr(), len) })?;

    Errno::result(rc).map(drop)
}

/// Safe ftruncate64() wrapper.
///
/// Sets the size of the file referred to by `fd` to `len` bytes.
pub fn ftruncate64<Fd: AsFd>(fd: Fd, len: off64_t) -> Result<(), Errno> {
    let raw_fd = fd.as_fd().as_raw_fd();

    // SAFETY: nix does not have a ftruncate64 wrapper.
    let rc = unsafe { libc::ftruncate64(raw_fd, len) };

    Errno::result(rc).map(drop)
}

// Description of one file extent.
//
// Element type of the `fm_extents` array in `fiemap`.
// Declared `#[repr(C)]` so the fields keep their declaration order
// and C layout; do not reorder or resize them.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct fiemap_extent {
    // Byte offset of the extent in the file.
    fe_logical: u64,
    // Byte offset of the extent on disk.
    fe_physical: u64,
    // Length in bytes for this extent.
    fe_length: u64,
    // Reserved 64-bit words.
    _fe_reserved64: [u64; 2],
    // FIEMAP_EXTENT_* flags for this extent.
    fe_flags: u32,
    // Reserved 32-bit words.
    _fe_reserved: [u32; 3],
}

// File extent mappings.
//
// Header type whose size is encoded into the `FS_IOC_FIEMAP` request number.
// Declared `#[repr(C)]`; the trailing zero-length array stands in for a
// C flexible array member and contributes no storage of its own.
//
// - `fm_start`: byte offset (inclusive) at which to start mapping (in)
// - `fm_length`: logical length of mapping which userspace wants (in)
// - `fm_flags`: FIEMAP_FLAG_* flags for request (in/out)
// - `fm_mapped_extents`: number of extents that were mapped (out)
// - `fm_extent_count`: size of `fm_extents` array (in)
// - `fm_reserved`: reserved
// - `fm_extents`: array of mapped extents (out)
#[repr(C)]
struct fiemap {
    fm_start: u64,
    fm_length: u64,
    fm_flags: u32,
    fm_mapped_extents: u32,
    fm_extent_count: u32,
    _fm_reserved: u32,
    // Flexible array of extents; actual length is `fm_extent_count`.
    fm_extents: [fiemap_extent; 0],
}

/// FS_IOC_FIEMAP ioctl(2) request.
///
/// Read/write request built with `_IOWR`, parameterized by the `fiemap`
/// header, with type byte `'f'` and request number 11.
pub const FS_IOC_FIEMAP: c_ulong = _IOWR::<fiemap>(b'f' as u32, 11) as c_ulong;

/// FIGETBSZ ioctl(2) request.
///
/// Argument-less request built with `_IO`, type byte `0x00` and number 2.
pub const FIGETBSZ: c_ulong = _IO(0x00, 2) as c_ulong;

// Information for a single dedupe operation on a destination file.
//
// Element type of the `info` array in `file_dedupe_range`.
// Declared `#[repr(C)]` so the fields keep their declaration order
// and C layout; do not reorder or resize them.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct file_dedupe_range_info {
    // Destination file descriptor.
    dest_fd: i64,
    // Start offset of the extent in the destination file.
    dest_offset: u64,
    // Number of bytes successfully deduped.
    bytes_deduped: u64,
    // Status of this dedupe operation:
    // < 0 for error,
    // == FILE_DEDUPE_RANGE_SAME if dedupe succeeds,
    // == FILE_DEDUPE_RANGE_DIFFERS if data differs.
    status: i32,
    // Must be zero.
    reserved: u32,
}

// Arguments for a range of dedupe operations from a source file.
//
// Header type whose size is encoded into the `FIDEDUPERANGE` request number.
// Declared `#[repr(C)]`; the trailing zero-length array stands in for a
// C flexible array member and contributes no storage of its own.
//
// - `src_offset`: start of the extent in the source file (in)
// - `src_length`: length of the extent (in)
// - `dest_count`: number of elements in the `info` array (in)
// - `reserved1`: must be zero
// - `reserved2`: must be zero
// - `info`: array of `file_dedupe_range_info` of length `dest_count` (out)
#[repr(C)]
struct file_dedupe_range {
    src_offset: u64,
    src_length: u64,
    dest_count: u16,
    reserved1: u16,
    reserved2: u32,
    // Flexible array member of length `dest_count`.
    info: [file_dedupe_range_info; 0],
}

/// FIDEDUPERANGE ioctl(2) request.
///
/// Read/write request built with `_IOWR`, parameterized by the
/// `file_dedupe_range` header, with type byte `0x94` and request number 54.
pub const FIDEDUPERANGE: c_ulong = _IOWR::<file_dedupe_range>(0x94, 54) as c_ulong;

// Filesystem UUID structure with fixed 16-byte buffer.
//
// Type whose size is encoded into the `FS_IOC_GETFSUUID` request number.
// Declared `#[repr(C)]` so the fields keep their declaration order.
//
// - `len`: actual length of the UUID (≤16)
// - `uuid`: UUID bytes
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct fsuuid2 {
    len: u8,
    uuid: [u8; 16],
}

/// FS_IOC_GETFSUUID ioctl(2) request.
///
/// Read request built with `_IOR`, parameterized by `fsuuid2`,
/// with type byte `0x15` and request number 0.
pub const FS_IOC_GETFSUUID: c_ulong = _IOR::<fsuuid2>(0x15, 0) as c_ulong;

// Filesystem sysfs path structure.
//
// Type whose size is encoded into the `FS_IOC_GETFSSYSFSPATH` request number.
// Declared `#[repr(C)]` so the fields keep their declaration order.
//
// - `len`: length of the returned name (≤128)
// - `name`: NUL-terminated path component under `/sys/fs/` or `/sys/kernel/debug/`
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct fs_sysfs_path {
    len: u8,
    name: [u8; 128],
}

/// FS_IOC_GETFSSYSFSPATH ioctl(2) request.
///
/// Read request built with `_IOR`, parameterized by `fs_sysfs_path`,
/// with type byte `0x15` and request number 1.
pub const FS_IOC_GETFSSYSFSPATH: c_ulong = _IOR::<fs_sysfs_path>(0x15, 1) as c_ulong;

/// FIBMAP ioctl(2) request.
///
/// Argument-less request built with `_IO`, type byte `0x00` and number 1.
pub const FIBMAP: c_ulong = _IO(0x00, 1) as c_ulong;

/// KDSETKEYCODE ioctl(2) request.
///
/// Given as a literal request number rather than built from `_IO*`.
pub const KDSETKEYCODE: c_ulong = 0x4B4D;

/// KDSIGACCEPT ioctl(2) request.
///
/// Given as a literal request number rather than built from `_IO*`.
pub const KDSIGACCEPT: c_ulong = 0x4B4E;

// File system extended attribute operations.
//
// Used with `FS_IOC_FSGETXATTR` and `FS_IOC_FSSETXATTR`,
// whose request numbers encode the size of this type.
// Declared `#[repr(C)]` so the fields keep their declaration order
// and C layout; do not reorder or resize them.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct fsxattr {
    // xflags field value (get/set)
    fsx_xflags: u32,
    // extsize field value (get/set)
    fsx_extsize: u32,
    // nextents field value (get)
    fsx_nextents: u32,
    // project identifier (get/set)
    fsx_projid: u32,
    // CoW extsize field value (get/set)
    fsx_cowextsize: u32,
    // Padding
    fsx_pad: [u8; 8],
}

/// FS_IOC_FSGETXATTR ioctl(2) request.
///
/// Read request built with `_IOR`, parameterized by `fsxattr`,
/// with type byte `'X'` and request number 31.
pub const FS_IOC_FSGETXATTR: c_ulong = _IOR::<fsxattr>(b'X' as u32, 31) as c_ulong;

/// FS_IOC_FSSETXATTR ioctl(2) request.
///
/// Write request built with `_IOW`, parameterized by `fsxattr`,
/// with type byte `'X'` and request number 32.
pub const FS_IOC_FSSETXATTR: c_ulong = _IOW::<fsxattr>(b'X' as u32, 32) as c_ulong;

/// FS_IOC_SETFLAGS ioctl(2) request.
///
/// Write request built with `_IOW`, parameterized by `c_long`,
/// with type byte `'f'` and request number 2.
pub const FS_IOC_SETFLAGS: c_ulong = _IOW::<c_long>(b'f' as u32, 2) as c_ulong;

/*
 * Seccomp constants
 */

/// '!' magic number for seccomp ioctls.
///
/// Used as the type byte of every `SECCOMP_IOCTL_NOTIF_*` request below.
pub const SECCOMP_IOCTL_MAGIC: u32 = b'!' as u32;

/// SECCOMP_IOCTL_NOTIF_RECV ioctl(2) request.
///
/// Read/write request number 0, parameterized by `seccomp_notif`.
pub const SECCOMP_IOCTL_NOTIF_RECV: c_ulong =
    _IOWR::<seccomp_notif>(SECCOMP_IOCTL_MAGIC, 0) as c_ulong;

/// SECCOMP_IOCTL_NOTIF_SEND ioctl(2) request.
///
/// Read/write request number 1, parameterized by `seccomp_notif_resp`.
pub const SECCOMP_IOCTL_NOTIF_SEND: c_ulong =
    _IOWR::<seccomp_notif_resp>(SECCOMP_IOCTL_MAGIC, 1) as c_ulong;

/// SECCOMP_IOCTL_NOTIF_ID_VALID ioctl(2) request.
///
/// Write request number 2, parameterized by `u64`.
pub const SECCOMP_IOCTL_NOTIF_ID_VALID: c_ulong = _IOW::<u64>(SECCOMP_IOCTL_MAGIC, 2) as c_ulong;

/// SECCOMP_IOCTL_NOTIF_ADDFD ioctl(2) request.
///
/// Write request number 3, parameterized by `seccomp_notif_addfd`.
pub const SECCOMP_IOCTL_NOTIF_ADDFD: c_ulong =
    _IOW::<seccomp_notif_addfd>(SECCOMP_IOCTL_MAGIC, 3) as c_ulong;

/// SECCOMP_IOCTL_NOTIF_SET_FLAGS ioctl(2) request.
///
/// Write request number 4, parameterized by `u64`.
pub const SECCOMP_IOCTL_NOTIF_SET_FLAGS: c_ulong = _IOW::<u64>(SECCOMP_IOCTL_MAGIC, 4) as c_ulong;

/// All seccomp-notify ioctl(2) requests defined in this file,
/// listed in ascending request-number order.
pub(crate) const SECCOMP_IOCTL_NOTIF_LIST: &[c_ulong] = &[
    SECCOMP_IOCTL_NOTIF_RECV,
    SECCOMP_IOCTL_NOTIF_SEND,
    SECCOMP_IOCTL_NOTIF_ID_VALID,
    SECCOMP_IOCTL_NOTIF_ADDFD,
    SECCOMP_IOCTL_NOTIF_SET_FLAGS,
];

/// Flag to set synchronous mode for the seccomp notify fd.
///
/// Intended as the `flags` argument of `seccomp_notify_set_flags`.
pub(crate) const SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP: u32 = 1;

/// Export a seccomp filter in pretty-printed PFC.
///
/// Mostly equivalent to _seccomp_export_pfc(3) with
/// some convenience replacements for seccomp constants,
/// and returns a String rather than a FD.
///
/// The temporary file is created in `/tmp` directory
/// with O_TMPFILE|O_EXCL flags and empty/zero Mode.
///
/// Any export failure is reported as `EFAULT`; I/O errors on the
/// temporary file are converted with `err2no`.
pub(crate) fn seccomp_export_pfc(ctx: &ScmpFilterContext) -> Result<String, Errno> {
    // SAFETY: This runs at startup before confinement!
    #[expect(clippy::disallowed_methods)]
    let tmp_fd = nix::fcntl::openat(
        AT_FDCWD,
        "/tmp",
        OFlag::O_TMPFILE | OFlag::O_EXCL | OFlag::O_RDWR,
        Mode::empty(),
    )?;
    let mut file = File::from(tmp_fd);

    // Dump the filter into the anonymous file...
    if ctx.export_pfc(&mut file).is_err() {
        return Err(Errno::EFAULT);
    }

    // ...then rewind and slurp it back.
    file.seek(SeekFrom::Start(0)).map_err(|err| err2no(&err))?;
    let mut raw = Vec::new();
    file.read_to_end(&mut raw).map_err(|err| err2no(&err))?;

    // Numeric value -> symbolic name, applied strictly in this order.
    let substitutions: [(String, &str); 7] = [
        ("0x7fc00000".to_string(), "NOTIFY"),
        (
            SECCOMP_IOCTL_NOTIF_RECV.to_string(),
            "SECCOMP_IOCTL_NOTIF_RECV",
        ),
        (
            SECCOMP_IOCTL_NOTIF_SEND.to_string(),
            "SECCOMP_IOCTL_NOTIF_SEND",
        ),
        (
            SECCOMP_IOCTL_NOTIF_ID_VALID.to_string(),
            "SECCOMP_IOCTL_NOTIF_ID_VALID",
        ),
        (
            SECCOMP_IOCTL_NOTIF_ADDFD.to_string(),
            "SECCOMP_IOCTL_NOTIF_ADDFD",
        ),
        (
            SECCOMP_IOCTL_NOTIF_SET_FLAGS.to_string(),
            "SECCOMP_IOCTL_NOTIF_SET_FLAGS",
        ),
        (PROCMAP_QUERY.to_string(), "PROCMAP_QUERY"),
    ];

    // from_utf8_lossy_to_owned() is nightly...
    let pfc = substitutions.iter().fold(
        String::from_utf8_lossy(&raw).into_owned(),
        |text, (from, to)| text.replace(from.as_str(), to),
    );

    Ok(pfc)
}

/// Set seccomp notify fd flags, useful to set synchronous mode.
///
/// Fails with `ENOSYS` without issuing the ioctl(2) when
/// `HAVE_SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP` is false.
/// The ioctl(2) itself is wrapped in `retry_on_eintr`.
pub(crate) fn seccomp_notify_set_flags(fd: RawFd, flags: u32) -> Result<(), Errno> {
    if !*HAVE_SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP {
        return Err(Errno::ENOSYS);
    }

    let request: c_ulong = SECCOMP_IOCTL_NOTIF_SET_FLAGS;
    let set_flags = || {
        // SAFETY: In libc we trust.
        let rc = unsafe { syscall(SYS_ioctl, fd, request, flags) };
        Errno::result(rc)
    };

    retry_on_eintr(set_flags).map(drop)
}

/// Wrapper for SECCOMP_IOCTL_NOTIF_ID_VALID ioctl(2),
/// aka _seccomp_notify_id_valid_(3) of libseccomp.
///
/// Returns `Ok(())` when the ioctl(2) succeeds for request `id` on the
/// seccomp-notify `fd`, and the reported `Errno` otherwise.
/// The ioctl(2) is wrapped in `retry_on_eintr`.
pub(crate) fn seccomp_notify_id_valid(fd: RawFd, id: u64) -> Result<(), Errno> {
    let request: c_ulong = SECCOMP_IOCTL_NOTIF_ID_VALID;

    let check = || {
        // SAFETY: Validate request ID against the seccomp-notify fd:
        // 1. This function is a hot path where we don't want to run
        //    notify_supported() on each call.
        // 2. We want to reliably handle EAGAIN and EINTR.
        // 3. ENOENT means child died mid-way.
        // libseccomp::notify_id_valid(fd, id).is_ok().
        let rc = unsafe { syscall(SYS_ioctl, fd, request, &id) };
        Errno::result(rc)
    };

    retry_on_eintr(check).map(drop)
}

/// Wrapper for SECCOMP_IOCTL_NOTIF_SEND ioctl(2),
/// aka _seccomp_notify_respond_(3) of libseccomp.
///
/// Sends `response` on the seccomp-notify `fd`. The ioctl(2) is wrapped
/// in `retry_on_eintr`, and three values from `SYSCOOKIE_POOL` are passed
/// as the trailing system call arguments.
///
/// `response` is handed to the kernel as-is; the caller is responsible
/// for it pointing to a valid `seccomp_notif_resp`.
pub(crate) fn seccomp_notify_respond(
    fd: RawFd,
    response: *const seccomp_notif_resp,
) -> Result<(), Errno> {
    retry_on_eintr(|| {
        // SAFETY:
        // 1. libseccomp's version allocates needlessly, and
        // 2. libseccomp-sys's version requires a mutable pointer, and
        // 3. libseccomp does not export EINTR to user API, but returns EFAULT as catch-all:
        //    https://github.com/seccomp/libseccomp/blob/5491c4b931431bec489dd78247ef675fc1b49797/src/api.c#L92-L95
        //    So we use _ioctl_(2) directly.
        // 4. EINTR may mean `syd_int` thread misfired us.
        // 5. ENOENT means child-died mid-way.
        // 6. Ok() is all good!
        // 7. We protect SECCOMP_IOCTL_NOTIF_SEND with system call argument cookies,
        //    to raise the bar against an attacker who has compromised Syd and aims
        //    to inject the flag SECCOMP_USER_NOTIF_FLAG_CONTINUE to this response
        //    in order to pass-through a system call to the host Linux kernel.
        // 8. Randomizing the seccomp-fd at startup is another mitigation against this.
        Errno::result(unsafe {
            syscall(
                SYS_ioctl,
                fd,
                SECCOMP_IOCTL_NOTIF_SEND as c_ulong,
                response,
                // Cookies occupy the otherwise unused ioctl(2) argument slots.
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifSendArg3),
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifSendArg4),
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifSendArg5),
            )
        })
    })
    .map(drop)
}

/// Wrapper for SECCOMP_IOCTL_NOTIF_ADDFD ioctl(2).
///
/// Submits `addfd` on the seccomp-notify `fd` and returns the ioctl(2)
/// return value as a `RawFd`. The ioctl(2) is wrapped in `retry_on_eintr`,
/// and three values from `SYSCOOKIE_POOL` are passed as the trailing
/// system call arguments.
///
/// `addfd` is handed to the kernel as-is; the caller is responsible
/// for it pointing to a valid `seccomp_notif_addfd`.
pub(crate) fn seccomp_notify_addfd(
    fd: RawFd,
    addfd: *const seccomp_notif_addfd,
) -> Result<RawFd, Errno> {
    #[expect(clippy::cast_possible_truncation)]
    retry_on_eintr(|| {
        // SAFETY:
        // 1. libseccomp has no wrapper for ADDFD yet, and
        // 2. libseccomp does not export EINTR to user API, but returns EFAULT as catch-all:
        //    https://github.com/seccomp/libseccomp/blob/5491c4b931431bec489dd78247ef675fc1b49797/src/api.c#L92-L95
        //    So we use _ioctl_(2) directly.
        // 3. EINTR may mean `syd_int` thread misfired us.
        // 4. ENOENT means child-died mid-way.
        // 5. Ok() is all good!
        // 6. We protect SECCOMP_IOCTL_NOTIF_ADDFD with system call argument cookies,
        //    to raise the bar against an attacker who has compromised Syd and aims
        //    to steal file descriptors.
        // 7. Randomizing the seccomp-fd at startup is another mitigation against this.
        Errno::result(unsafe {
            syscall(
                SYS_ioctl,
                fd,
                SECCOMP_IOCTL_NOTIF_ADDFD as c_ulong,
                addfd,
                // Cookies occupy the otherwise unused ioctl(2) argument slots.
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifAddfdArg3),
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifAddfdArg4),
                SYSCOOKIE_POOL.get(CookieIdx::SeccompIoctlNotifAddfdArg5),
            )
        })
    })
    // The successful return value is narrowed from `c_long` to `RawFd`.
    .map(|fd| fd as RawFd)
}

/// Returns the access mode from the given `OFlag`.
///
/// The result keeps only the bits of `O_ACCMODE` and `O_PATH`;
/// every other flag is cleared.
pub fn oflag_accmode(flags: OFlag) -> OFlag {
    // 1. glibc does not include O_PATH to O_ACCMODE.
    // 2. musl defines O_PATH equal to O_EXEC and O_SEARCH,
    //    and O_ACCMODE is defined as O_ACCMODE|O_SEARCH.
    // Here we force the second behaviour by explicitly
    // adding O_PATH into O_ACCMODE. This works on both libcs.
    // See: https://www.openwall.com/lists/musl/2013/02/22/1
    let accmode_mask = OFlag::O_ACCMODE | OFlag::O_PATH;

    flags & accmode_mask
}

/// Returns true if the open flags refer to nonblocking i/o.
///
/// That is, when at least one bit of `O_NONBLOCK` or `O_NDELAY` is set.
pub fn oflag_nonblock(flags: OFlag) -> bool {
    let nonblocking = OFlag::O_NONBLOCK | OFlag::O_NDELAY;

    flags.intersects(nonblocking)
}

/// A safe version of clone that returns a PidFD,
/// and therefore is not subject to PID-recycling
/// races.
///
/// - `cb`: closure run by the child through a C trampoline;
///   its return value is cast to `c_int` and returned from the trampoline.
/// - `stack`: memory used as the child's stack.
/// - `flags`: clone(2) flags; `CLONE_PIDFD` is always added.
/// - `signal`: optional value OR'ed into the flags argument.
///
/// Returns the PidFD as an `OwnedFd` on success, or the `Errno`
/// reported by clone(2) on failure.
pub fn safe_clone(
    mut cb: CloneCb,
    stack: &mut [u8],
    flags: c_int,
    signal: Option<c_int>,
) -> Result<OwnedFd, Errno> {
    // C-ABI trampoline: recovers the closure from the opaque argument
    // pointer and invokes it.
    #[expect(clippy::cast_possible_truncation)]
    extern "C" fn callback(data: *mut CloneCb) -> c_int {
        // SAFETY: nix' version does not support CLONE_PIDFD.
        let cb: &mut CloneCb = unsafe { &mut *data };
        (*cb)() as c_int
    }

    // Filled in by clone(2) because CLONE_PIDFD is set below.
    let mut pid_fd: c_int = -1;
    let combined: c_int = flags | CLONE_PIDFD | signal.unwrap_or(0);
    // SAFETY: ditto.
    #[expect(clippy::missing_transmute_annotations)]
    let res = unsafe {
        // Point one past the end of the buffer, then round the address
        // down to a multiple of 16.
        let ptr = stack.as_mut_ptr().add(stack.len());
        let ptr_aligned = ptr.sub(ptr as usize % 16);
        clone(
            // The trampoline is transmuted to the function pointer type
            // `clone` expects.
            std::mem::transmute(callback as extern "C" fn(*mut Box<dyn FnMut() -> isize>) -> i32),
            ptr_aligned as *mut c_void,
            combined,
            // Opaque argument handed back to `callback` as `data`.
            std::ptr::addr_of_mut!(cb) as *mut c_void,
            &mut pid_fd,
        )
    };

    Errno::result(res).map(|_| {
        // SAFETY: clone with CLONE_PIDFD returns a valid FD.
        unsafe { OwnedFd::from_raw_fd(pid_fd) }
    })
}

/// libc may not define process_mrelease yet (e.g. musl on riscv64).
///
/// The number is therefore resolved by name on first use; it is `None`
/// when the lookup fails or yields a negative number.
static SYS_PROCESS_MRELEASE: LazyLock<Option<c_long>> = LazyLock::new(|| {
    ScmpSyscall::from_name("process_mrelease")
        .ok()
        .map(|sys| c_long::from(i32::from(sys)))
        .filter(|&nr| nr >= 0)
});

/// Safe wrapper for process_mrelease(2).
///
/// This function requires Linux 5.15+.
///
/// Fails with `ENOSYS` when no system call number could be resolved.
pub fn process_mrelease<Fd: AsFd>(pid_fd: Fd) -> Result<(), Errno> {
    let Some(sysnum) = *SYS_PROCESS_MRELEASE else {
        return Err(Errno::ENOSYS);
    };
    let raw_fd = pid_fd.as_fd().as_raw_fd();

    // SAFETY:
    // 1. libc does not have a wrapper for process_mrelease yet.
    // 2. libc may not define SYS_process_mrelease yet.
    let rc = unsafe { syscall(sysnum, raw_fd, 0) };

    Errno::result(rc).map(drop)
}

/// Safe wrapper for tgkill(2).
///
/// Sends signal `sig` to thread `tid` of thread group `tgid`.
pub fn tgkill(tgid: Pid, tid: Pid, sig: i32) -> Result<(), Errno> {
    let (group, thread) = (tgid.as_raw(), tid.as_raw());

    // SAFETY: There's no libc wrapper for tgkill.
    let rc = unsafe { syscall(SYS_tgkill, group, thread, sig) };

    Errno::result(rc).map(drop)
}

/// Safe wrapper for sigwaitinfo(2).
///
/// When `info` is `None`, a NULL pointer is passed for the signal info.
pub fn sigwaitinfo(set: &SigSet, info: Option<&mut siginfo_t>) -> Result<i32, Errno> {
    let info: *mut siginfo_t = match info {
        Some(si) => si as *mut siginfo_t,
        None => std::ptr::null_mut(),
    };

    // SAFETY: In libc we trust.
    let signo = unsafe { crate::compat::sigwaitinfo(set.as_ref(), info) };

    Errno::result(signo)
}

/// Safe wrapper for sigtimedwait(2).
///
/// When `info` is `None`, a NULL pointer is passed for the signal info.
pub fn sigtimedwait(
    set: &SigSet,
    info: Option<&mut siginfo_t>,
    timeout: TimeSpec,
) -> Result<i32, Errno> {
    let info: *mut siginfo_t = match info {
        Some(si) => si as *mut siginfo_t,
        None => std::ptr::null_mut(),
    };

    // SAFETY: In libc we trust.
    let signo = unsafe { libc::sigtimedwait(set.as_ref(), info, timeout.as_ref()) };

    Errno::result(signo)
}

/// Convenience wrapper for sigtimedwait with zero TimeSpec.
pub fn sigtimedpoll(set: &SigSet, info: Option<&mut siginfo_t>) -> Result<i32, Errno> {
    // Zero seconds, zero nanoseconds.
    let no_wait = TimeSpec::new(0, 0);

    sigtimedwait(set, info, no_wait)
}

/// Convenience wrapper to block a single Signal.
///
/// Builds a signal set holding only `sig` and blocks it via
/// `SigSet::thread_block`.
pub fn block_signal(sig: Signal) -> Result<(), Errno> {
    let mut only_sig = SigSet::empty();
    only_sig.add(sig);

    SigSet::thread_block(&only_sig)
}

/// Convenience wrapper to unblock a single Signal.
///
/// Builds a signal set holding only `sig` and unblocks it via
/// `SigSet::thread_unblock`.
pub fn unblock_signal(sig: Signal) -> Result<(), Errno> {
    let mut only_sig = SigSet::empty();
    only_sig.add(sig);

    SigSet::thread_unblock(&only_sig)
}

/// Read a symbolic link and return a `XPathBuf`.
///
/// Resolves `base` relative to `fd` with readlinkat(2) into a
/// `PATH_MAX`-sized stack buffer.
///
/// # Errors
///
/// - Any `Errno` reported by readlinkat(2) or by the path conversion.
/// - `ENAMETOOLONG` when the link target fills the whole buffer, because
///   the target may then have been silently truncated.
// TODO: Move to compat.rs
pub fn readlinkat<Fd: AsFd, P: NixPath + ?Sized>(fd: Fd, base: &P) -> Result<XPathBuf, Errno> {
    // Initialize target on the stack.
    let mut target = [0u8; PATH_MAX];

    let n = base.with_nix_path(|cstr| {
        // SAFETY: We don't want nix' extra handling around readlink here.
        #[expect(clippy::cast_sign_loss)]
        Errno::result(unsafe {
            libc::readlinkat(
                fd.as_fd().as_raw_fd(),
                cstr.as_ptr(),
                target.as_mut_ptr().cast(),
                target.len(),
            )
        })
        .map(|n| n as usize)
    })??;

    // readlinkat(2) never returns more than the buffer size and does not
    // report truncation: a return value equal to the buffer size is the
    // only indication that the target may not have fit.
    if n >= target.len() {
        // Truncation occurred!
        return Err(Errno::ENAMETOOLONG);
    }

    Ok(target[..n].into())
}

/// Read a symbolic link from FD and return a `XPathBuf`.
///
/// Calls `readlinkat` with an empty path and reports `EINVAL`
/// in place of `ENOENT`.
pub fn readlinkfd<Fd: AsFd>(fd: Fd) -> Result<XPathBuf, Errno> {
    match readlinkat(fd, c"") {
        // FD-only readlinkat(2) returns ENOENT,
        // when FD is not a symbolic link.
        Err(Errno::ENOENT) => Err(Errno::EINVAL),
        result => result,
    }
}

/// Create file and write the given content.
///
/// The file at `path` is created with `File::create`, and all of
/// `content` is written to it.
#[expect(clippy::disallowed_methods)]
pub fn cat<P: AsRef<Path>, T: AsRef<[u8]>>(path: P, content: T) -> std::io::Result<()> {
    let bytes = content.as_ref();

    File::create(path)?.write_all(bytes)
}

/// Make a file executable.
///
/// Sets the mode of `path` to `0o700`, i.e. read, write and execute
/// for the owner only.
pub fn chmod_x<P: AsRef<Path>>(path: P) -> std::io::Result<()> {
    let target = path.as_ref();

    // Start from the current permissions and override the mode bits.
    let mut perms = metadata(target)?.permissions();
    perms.set_mode(0o700);

    set_permissions(target, perms)
}

/// Format open(2) flags for serialization.
pub fn format_oflags(flags: OFlag) -> Vec<String> {
    let count = flags.into_iter().count();
    if count == 0 {
        return vec![];
    }

    let mut fmt = Vec::with_capacity(count);
    for flag in flags.iter() {
        fmt.push(format_oflag(flag));
    }

    fmt
}

/// Format a single open(2) flag for serialization.
///
/// Takes the `Debug` representation of `flag`, strips the leading
/// `OFlag(O_` and the trailing `)`, and lowercases the rest.
/// Returns `"?"` when the representation does not have that shape.
pub fn format_oflag(flag: OFlag) -> String {
    let repr = format!("{flag:?}");

    repr.strip_prefix("OFlag(O_")
        .and_then(|rest| rest.strip_suffix(')'))
        .map_or_else(|| "?".to_string(), |name| name.to_ascii_lowercase())
}

/// Format unshare(2) flags for serialization.
///
/// Returns the short names of the namespace flags contained in `flags`,
/// always in the fixed order of the table below.
pub fn format_clone_flags(flags: CloneFlags) -> Vec<&'static str> {
    // Namespace flag -> short name, in output order.
    let namespaces = [
        (CloneFlags::CLONE_NEWUSER, "user"),
        (CloneFlags::CLONE_NEWNS, "mount"),
        (CloneFlags::CLONE_NEWUTS, "uts"),
        (CloneFlags::CLONE_NEWIPC, "ipc"),
        (CloneFlags::CLONE_NEWPID, "pid"),
        (CloneFlags::CLONE_NEWNET, "net"),
        (CloneFlags::CLONE_NEWCGROUP, "cgroup"),
        (CLONE_NEWTIME, "time"),
    ];

    namespaces
        .into_iter()
        .filter(|(flag, _)| flags.contains(*flag))
        .map(|(_, name)| name)
        .collect()
}

/// Format the return vector from `format_clone_flags` into a `String`.
///
/// - no names: `"no namespaces"`
/// - one name: `"<a> namespace"`
/// - two names: `"<a> and <b> namespaces"`
/// - more: names joined with `", "`, with the last `", "` turned
///   into `", and "`, followed by `" namespaces"`.
pub fn format_clone_names(clone_names: &[&str]) -> String {
    match clone_names {
        [] => "no namespaces".to_string(),
        [only] => format!("{only} namespace"),
        [first, second] => format!("{first} and {second} namespaces"),
        _ => {
            let joined = clone_names.join(", ");
            // Split at the last separator to slip the "and" in.
            match joined.rsplit_once(", ") {
                Some((head, tail)) => format!("{head}, and {tail} namespaces"),
                None => format!("{joined} namespaces"),
            }
        }
    }
}

/// Searches for a name within a directory.
///
/// `name` is matched literally and exactly against file names.
/// Before matching, a one-byte type marker is appended to the entry name:
/// - directories: `/`
/// - symbolic links: `@`
/// - block devices: `!`
/// - character devices: `$`
/// - FIFOs: `|`
/// - sockets: `~`
///
/// Entries of any other type are matched without a marker.
///
/// As a special case, a single-byte `name` matches the first entry whose
/// marked name ends with that byte.
///
/// Returns the marked name of the first match, or `None` when the
/// directory cannot be opened or `getdents64` returns an error before
/// a match is found.
#[expect(clippy::disallowed_methods)]
pub fn grep(dir: &XPath, name: &[u8]) -> Option<XPathBuf> {
    let dir = File::open(dir.as_path()).ok()?;
    let name = XPath::from_bytes(name);
    loop {
        // Read the next batch of entries; any error ends the search.
        // NOTE(review): presumably end-of-directory is reported as an
        // error by `getdents64`, otherwise this loop would not terminate
        // without a match; confirm against compat.rs.
        let mut entries = getdents64(&dir, 128).ok()?;
        for entry in &mut entries {
            let mut path = XPathBuf::from(entry.name_bytes());
            // Append a type marker depending on the kind of entry.
            if entry.is_dir() {
                path.append_byte(b'/');
            } else if entry.is_symlink() {
                path.append_byte(b'@');
            } else if entry.is_block_device() {
                path.append_byte(b'!');
            } else if entry.is_char_device() {
                path.append_byte(b'$');
            } else if entry.is_fifo() {
                path.append_byte(b'|');
            } else if entry.is_socket() {
                path.append_byte(b'~');
            }
            // Exact match, or suffix match for a single-byte name.
            if *path == *name || (name.len() == 1 && path.ends_with(name.as_bytes())) {
                return Some(path);
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_base_offset_root_and_non_root() {
        // Mirrors the computation used in CanonicalPath::new.
        fn off(parent_len: usize) -> usize {
            if parent_len > 1 {
                parent_len + 1
            } else {
                parent_len
            }
        }

        assert_eq!(off(1), 1, "root parent must not drop first byte");
        assert_eq!(off(5), 6, "non-root parent must skip one separator");
    }

    #[test]
    fn test_oflag_rdonly_is_empty() {
        // O_RDONLY contributes no bits, so the set stays empty.
        let flags = OFlag::empty() | OFlag::O_RDONLY;
        assert!(flags.is_empty());
    }

    #[test]
    fn test_oflag_accmode() {
        // (input flags, expected access mode)
        let cases = [
            // Bare access modes.
            (OFlag::empty(), OFlag::O_RDONLY),
            (OFlag::O_RDONLY, OFlag::O_RDONLY),
            (OFlag::O_WRONLY, OFlag::O_WRONLY),
            (OFlag::O_RDWR, OFlag::O_RDWR),
            (OFlag::O_PATH, OFlag::O_PATH),
            // Access modes combined with unrelated flags.
            (OFlag::empty() | OFlag::O_APPEND, OFlag::O_RDONLY),
            (OFlag::O_RDONLY | OFlag::O_ASYNC, OFlag::O_RDONLY),
            (
                OFlag::O_WRONLY | OFlag::O_CREAT | OFlag::O_EXCL | OFlag::O_TRUNC,
                OFlag::O_WRONLY,
            ),
            (
                OFlag::O_RDWR | OFlag::O_CLOEXEC | OFlag::O_DIRECTORY,
                OFlag::O_RDWR,
            ),
            (OFlag::O_PATH | OFlag::O_NOFOLLOW, OFlag::O_PATH),
        ];

        for (input, expected) in cases {
            assert_eq!(oflag_accmode(input), expected);
        }
    }
}
