mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-04-02 12:47:41 +08:00
Merge patch series "introduce PIDFD_SELF* sentinels"
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> says: If you wish to utilise a pidfd interface to refer to the current process or thread then there is a lot of ceremony involved, looking something like: pid_t pid = getpid(); int pidfd = pidfd_open(pid, PIDFD_THREAD); if (pidfd < 0) { ... error handling ... } if (process_madvise(pidfd, iovec, 8, MADV_GUARD_INSTALL, 0)) { ... cleanup pidfd ... ... error handling ... } ... ... cleanup pidfd ... This adds unnecessary overhead + system calls, complicated error handling and in addition pidfd_open() is subject to RLIMIT_NOFILE (i.e. the limit of per-process number of open file descriptors), so the call may fail spuriously on this basis. Rather than doing this we can make use of sentinels for this purpose which can be passed as the pidfd instead. This looks like: if (process_madvise(PIDFD_SELF, iovec, 8, MADV_GUARD_INSTALL, 0)) { ... error handling ... } And avoids all of the aforementioned issues. This series introduces such sentinels. It is useful to refer to both the current thread from the userland's perspective for which we use PIDFD_SELF, and the current process from the userland's perspective, for which we use PIDFD_SELF_PROCESS. There is unfortunately some confusion between the kernel and userland as to what constitutes a process - a thread from the userland perspective is a process in the kernel, and a userland process is a thread group (more specifically the thread group leader from the kernel perspective). We therefore alias things thusly: * PIDFD_SELF_THREAD aliased by PIDFD_SELF - use PIDTYPE_PID. * PIDFD_SELF_THREAD_GROUP aliased by PIDFD_SELF_PROCESS - use PIDTYPE_TGID. In all of the kernel code we refer to PIDFD_SELF_THREAD and PIDFD_SELF_THREAD_GROUP. However we expect users to use PIDFD_SELF and PIDFD_SELF_PROCESS. 
This matters for cases where, for instance, a user unshare()'s FDs or does thread-specific signal handling and where the user would be hugely confused if the FDs referenced or signal processed referred to the thread group leader rather than the individual thread. Another use-case comes from Android. When installing multiple MADV_GUARD_INSTALL guard pages they considered caching the result of pidfd_open() for reuse. That however wouldn't work in shared libraries where users can fork() in between such calls. For now we only adjust pidfd_get_task() and the pidfd_send_signal() system call with specific handling for this, implementing this functionality for process_madvise(), process_mrelease() (albeit, using it here wouldn't really make sense) and pidfd_send_signal(). We defer making further changes, as this would require a significant rework of the pidfd mechanism. The motivating case here is to support PIDFD_SELF in process_madvise(), so this suffices for immediate uses. Moving forward, this can be further expanded to other uses. * patches from https://lore.kernel.org/r/cover.1738268370.git.lorenzo.stoakes@oracle.com: selftests/mm: use PIDFD_SELF in guard pages test selftests/pidfd: add tests for PIDFD_SELF_* selftests/pidfd: add new PIDFD_SELF* defines pidfd: add PIDFD_SELF* sentinels to refer to own thread/process Link: https://lore.kernel.org/r/cover.1738268370.git.lorenzo.stoakes@oracle.com Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
@@ -23,6 +23,30 @@
|
||||
|
||||
#define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */
|
||||
|
||||
/*
|
||||
* The concept of process and threads in userland and the kernel is a confusing
|
||||
* one - within the kernel every thread is a 'task' with its own individual PID,
|
||||
* however from userland's point of view threads are grouped by a single PID,
|
||||
* which is that of the 'thread group leader', typically the first thread
|
||||
* spawned.
|
||||
*
|
||||
* To cut the Gordian knot, for internal kernel usage, we refer to
|
||||
* PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel
|
||||
* perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread
|
||||
* group leader...
|
||||
*/
|
||||
#define PIDFD_SELF_THREAD -10000 /* Current thread. */
|
||||
#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */
|
||||
|
||||
/*
|
||||
* ...and for userland we make life simpler - PIDFD_SELF refers to the current
|
||||
* thread, PIDFD_SELF_PROCESS refers to the process thread group leader.
|
||||
*
|
||||
* For nearly all practical uses, a user will want to use PIDFD_SELF.
|
||||
*/
|
||||
#define PIDFD_SELF PIDFD_SELF_THREAD
|
||||
#define PIDFD_SELF_PROCESS PIDFD_SELF_THREAD_GROUP
|
||||
|
||||
struct pidfd_info {
|
||||
/*
|
||||
* This mask is similar to the request_mask in statx(2).
|
||||
|
||||
24
kernel/pid.c
24
kernel/pid.c
@@ -564,15 +564,29 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
|
||||
*/
|
||||
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
|
||||
{
|
||||
unsigned int f_flags;
|
||||
unsigned int f_flags = 0;
|
||||
struct pid *pid;
|
||||
struct task_struct *task;
|
||||
enum pid_type type;
|
||||
|
||||
pid = pidfd_get_pid(pidfd, &f_flags);
|
||||
if (IS_ERR(pid))
|
||||
return ERR_CAST(pid);
|
||||
switch (pidfd) {
|
||||
case PIDFD_SELF_THREAD:
|
||||
type = PIDTYPE_PID;
|
||||
pid = get_task_pid(current, type);
|
||||
break;
|
||||
case PIDFD_SELF_THREAD_GROUP:
|
||||
type = PIDTYPE_TGID;
|
||||
pid = get_task_pid(current, type);
|
||||
break;
|
||||
default:
|
||||
pid = pidfd_get_pid(pidfd, &f_flags);
|
||||
if (IS_ERR(pid))
|
||||
return ERR_CAST(pid);
|
||||
type = PIDTYPE_TGID;
|
||||
break;
|
||||
}
|
||||
|
||||
task = get_pid_task(pid, PIDTYPE_TGID);
|
||||
task = get_pid_task(pid, type);
|
||||
put_pid(pid);
|
||||
if (!task)
|
||||
return ERR_PTR(-ESRCH);
|
||||
|
||||
117
kernel/signal.c
117
kernel/signal.c
@@ -4009,56 +4009,12 @@ static struct pid *pidfd_to_pid(const struct file *file)
|
||||
(PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
|
||||
PIDFD_SIGNAL_PROCESS_GROUP)
|
||||
|
||||
/**
|
||||
* sys_pidfd_send_signal - Signal a process through a pidfd
|
||||
* @pidfd: file descriptor of the process
|
||||
* @sig: signal to send
|
||||
* @info: signal info
|
||||
* @flags: future flags
|
||||
*
|
||||
* Send the signal to the thread group or to the individual thread depending
|
||||
* on PIDFD_THREAD.
|
||||
* In the future extension to @flags may be used to override the default scope
|
||||
* of @pidfd.
|
||||
*
|
||||
* Return: 0 on success, negative errno on failure
|
||||
*/
|
||||
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
|
||||
siginfo_t __user *, info, unsigned int, flags)
|
||||
static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type,
|
||||
siginfo_t __user *info, unsigned int flags)
|
||||
{
|
||||
int ret;
|
||||
struct pid *pid;
|
||||
kernel_siginfo_t kinfo;
|
||||
enum pid_type type;
|
||||
|
||||
/* Enforce flags be set to 0 until we add an extension. */
|
||||
if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
|
||||
return -EINVAL;
|
||||
|
||||
/* Ensure that only a single signal scope determining flag is set. */
|
||||
if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
|
||||
return -EINVAL;
|
||||
|
||||
CLASS(fd, f)(pidfd);
|
||||
if (fd_empty(f))
|
||||
return -EBADF;
|
||||
|
||||
/* Is this a pidfd? */
|
||||
pid = pidfd_to_pid(fd_file(f));
|
||||
if (IS_ERR(pid))
|
||||
return PTR_ERR(pid);
|
||||
|
||||
if (!access_pidfd_pidns(pid))
|
||||
return -EINVAL;
|
||||
|
||||
switch (flags) {
|
||||
case 0:
|
||||
/* Infer scope from the type of pidfd. */
|
||||
if (fd_file(f)->f_flags & PIDFD_THREAD)
|
||||
type = PIDTYPE_PID;
|
||||
else
|
||||
type = PIDTYPE_TGID;
|
||||
break;
|
||||
case PIDFD_SIGNAL_THREAD:
|
||||
type = PIDTYPE_PID;
|
||||
break;
|
||||
@@ -4071,6 +4027,8 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
|
||||
}
|
||||
|
||||
if (info) {
|
||||
int ret;
|
||||
|
||||
ret = copy_siginfo_from_user_any(&kinfo, info);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
@@ -4088,8 +4046,71 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
|
||||
|
||||
if (type == PIDTYPE_PGID)
|
||||
return kill_pgrp_info(sig, &kinfo, pid);
|
||||
else
|
||||
return kill_pid_info_type(sig, &kinfo, pid, type);
|
||||
|
||||
return kill_pid_info_type(sig, &kinfo, pid, type);
|
||||
}
|
||||
|
||||
/**
|
||||
* sys_pidfd_send_signal - Signal a process through a pidfd
|
||||
* @pidfd: file descriptor of the process, or PIDFD_SELF_THREAD / PIDFD_SELF_THREAD_GROUP
|
||||
* @sig: signal to send
|
||||
* @info: signal info
|
||||
* @flags: optional PIDFD_SIGNAL_* scope flag; at most one may be set
|
||||
*
|
||||
* Send the signal to the thread group or to the individual thread depending
|
||||
* on PIDFD_THREAD.
|
||||
* A PIDFD_SIGNAL_* flag passed in @flags may be used to override the default scope
|
||||
* of @pidfd.
|
||||
*
|
||||
* Return: 0 on success, negative errno on failure
|
||||
*/
|
||||
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
|
||||
siginfo_t __user *, info, unsigned int, flags)
|
||||
{
|
||||
struct pid *pid;
|
||||
enum pid_type type;
|
||||
|
||||
/* Reject any flag bits outside of PIDFD_SEND_SIGNAL_FLAGS. */
|
||||
if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
|
||||
return -EINVAL;
|
||||
|
||||
/* Ensure that only a single signal scope determining flag is set. */
|
||||
if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
|
||||
return -EINVAL;
|
||||
|
||||
switch (pidfd) {
|
||||
case PIDFD_SELF_THREAD:
|
||||
pid = get_task_pid(current, PIDTYPE_PID);
|
||||
type = PIDTYPE_PID;
|
||||
break;
|
||||
case PIDFD_SELF_THREAD_GROUP:
|
||||
pid = get_task_pid(current, PIDTYPE_TGID);
|
||||
type = PIDTYPE_TGID;
|
||||
break;
|
||||
default: {
|
||||
CLASS(fd, f)(pidfd);
|
||||
if (fd_empty(f))
|
||||
return -EBADF;
|
||||
|
||||
/* Is this a pidfd? */
|
||||
pid = pidfd_to_pid(fd_file(f));
|
||||
if (IS_ERR(pid))
|
||||
return PTR_ERR(pid);
|
||||
|
||||
if (!access_pidfd_pidns(pid))
|
||||
return -EINVAL;
|
||||
|
||||
/* Infer scope from the type of pidfd. */
|
||||
if (fd_file(f)->f_flags & PIDFD_THREAD)
|
||||
type = PIDTYPE_PID;
|
||||
else
|
||||
type = PIDTYPE_TGID;
|
||||
|
||||
return do_pidfd_send_signal(pid, sig, type, info, flags);
|
||||
}
|
||||
}
|
||||
|
||||
return do_pidfd_send_signal(pid, sig, type, info, flags);
|
||||
}
|
||||
|
||||
static int
|
||||
|
||||
@@ -19,6 +19,8 @@
|
||||
#include <sys/uio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "../pidfd/pidfd.h"
|
||||
|
||||
/*
|
||||
* Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1:
|
||||
*
|
||||
@@ -50,11 +52,6 @@ static void handle_fatal(int c)
|
||||
siglongjmp(signal_jmp_buf, c);
|
||||
}
|
||||
|
||||
static int pidfd_open(pid_t pid, unsigned int flags)
|
||||
{
|
||||
return syscall(SYS_pidfd_open, pid, flags);
|
||||
}
|
||||
|
||||
static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec,
|
||||
size_t n, int advice, unsigned int flags)
|
||||
{
|
||||
@@ -370,14 +367,10 @@ TEST_F(guard_pages, multi_vma)
|
||||
TEST_F(guard_pages, process_madvise)
|
||||
{
|
||||
const unsigned long page_size = self->page_size;
|
||||
pid_t pid = getpid();
|
||||
int pidfd = pidfd_open(pid, 0);
|
||||
char *ptr_region, *ptr1, *ptr2, *ptr3;
|
||||
ssize_t count;
|
||||
struct iovec vec[6];
|
||||
|
||||
ASSERT_NE(pidfd, -1);
|
||||
|
||||
/* Reserve region to map over. */
|
||||
ptr_region = mmap(NULL, 100 * page_size, PROT_NONE,
|
||||
MAP_ANON | MAP_PRIVATE, -1, 0);
|
||||
@@ -425,7 +418,7 @@ TEST_F(guard_pages, process_madvise)
|
||||
ASSERT_EQ(munmap(&ptr_region[99 * page_size], page_size), 0);
|
||||
|
||||
/* Now guard in one step. */
|
||||
count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0);
|
||||
count = sys_process_madvise(PIDFD_SELF, vec, 6, MADV_GUARD_INSTALL, 0);
|
||||
|
||||
/* OK we don't have permission to do this, skip. */
|
||||
if (count == -1 && errno == EPERM)
|
||||
@@ -446,7 +439,7 @@ TEST_F(guard_pages, process_madvise)
|
||||
ASSERT_FALSE(try_read_write_buf(&ptr3[19 * page_size]));
|
||||
|
||||
/* Now do the same with unguard... */
|
||||
count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0);
|
||||
count = sys_process_madvise(PIDFD_SELF, vec, 6, MADV_GUARD_REMOVE, 0);
|
||||
|
||||
/* ...and everything should now succeed. */
|
||||
|
||||
@@ -463,7 +456,6 @@ TEST_F(guard_pages, process_madvise)
|
||||
ASSERT_EQ(munmap(ptr1, 10 * page_size), 0);
|
||||
ASSERT_EQ(munmap(ptr2, 5 * page_size), 0);
|
||||
ASSERT_EQ(munmap(ptr3, 20 * page_size), 0);
|
||||
close(pidfd);
|
||||
}
|
||||
|
||||
/* Assert that unmapping ranges does not leave guard markers behind. */
|
||||
|
||||
@@ -50,6 +50,22 @@
|
||||
#define PIDFD_NONBLOCK O_NONBLOCK
|
||||
#endif
|
||||
|
||||
#ifndef PIDFD_SELF_THREAD
|
||||
#define PIDFD_SELF_THREAD -10000 /* Current thread. */
|
||||
#endif
|
||||
|
||||
#ifndef PIDFD_SELF_THREAD_GROUP
|
||||
#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */
|
||||
#endif
|
||||
|
||||
#ifndef PIDFD_SELF
|
||||
#define PIDFD_SELF PIDFD_SELF_THREAD
|
||||
#endif
|
||||
|
||||
#ifndef PIDFD_SELF_PROCESS
|
||||
#define PIDFD_SELF_PROCESS PIDFD_SELF_THREAD_GROUP
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
|
||||
* That means, when it wraps around any pid < 300 will be skipped.
|
||||
|
||||
@@ -31,7 +31,7 @@
|
||||
#define PIDFD_INFO_CGROUPID (1UL << 0)
|
||||
|
||||
struct pidfd_info {
|
||||
__u64 request_mask;
|
||||
__u64 mask;
|
||||
__u64 cgroupid;
|
||||
__u32 pid;
|
||||
__u32 tgid;
|
||||
@@ -148,7 +148,7 @@ out:
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct pidfd_info info = {
|
||||
.request_mask = PIDFD_INFO_CGROUPID,
|
||||
.mask = PIDFD_INFO_CGROUPID,
|
||||
};
|
||||
int pidfd = -1, ret = 1;
|
||||
pid_t pid;
|
||||
@@ -227,7 +227,7 @@ int main(int argc, char **argv)
|
||||
getegid(), info.sgid);
|
||||
goto on_error;
|
||||
}
|
||||
if ((info.request_mask & PIDFD_INFO_CGROUPID) && info.cgroupid == 0) {
|
||||
if ((info.mask & PIDFD_INFO_CGROUPID) && info.cgroupid == 0) {
|
||||
ksft_print_msg("cgroupid should not be 0 when PIDFD_INFO_CGROUPID is set\n");
|
||||
goto on_error;
|
||||
}
|
||||
|
||||
@@ -42,12 +42,41 @@ static pid_t pidfd_clone(int flags, int *pidfd, int (*fn)(void *))
|
||||
#endif
|
||||
}
|
||||
|
||||
static int signal_received;
|
||||
static pthread_t signal_received;
|
||||
|
||||
static void set_signal_received_on_sigusr1(int sig)
|
||||
{
|
||||
if (sig == SIGUSR1)
|
||||
signal_received = 1;
|
||||
signal_received = pthread_self();
|
||||
}
|
||||
|
||||
static int send_signal(int pidfd)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0) < 0) {
|
||||
ret = -EINVAL;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if (signal_received != pthread_self()) {
|
||||
ret = -EINVAL;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
exit:
|
||||
signal_received = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void *send_signal_worker(void *arg)
|
||||
{
|
||||
int pidfd = (int)(intptr_t)arg;
|
||||
int ret;
|
||||
|
||||
/* We forward any errors for the caller to handle. */
|
||||
ret = send_signal(pidfd);
|
||||
return (void *)(intptr_t)ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -56,8 +85,11 @@ static void set_signal_received_on_sigusr1(int sig)
|
||||
*/
|
||||
static int test_pidfd_send_signal_simple_success(void)
|
||||
{
|
||||
int pidfd, ret;
|
||||
int pidfd;
|
||||
const char *test_name = "pidfd_send_signal send SIGUSR1";
|
||||
pthread_t thread;
|
||||
void *thread_res;
|
||||
int err;
|
||||
|
||||
if (!have_pidfd_send_signal) {
|
||||
ksft_test_result_skip(
|
||||
@@ -66,25 +98,45 @@ static int test_pidfd_send_signal_simple_success(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
signal(SIGUSR1, set_signal_received_on_sigusr1);
|
||||
|
||||
/* Try sending a signal to ourselves via /proc/self. */
|
||||
pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC);
|
||||
if (pidfd < 0)
|
||||
ksft_exit_fail_msg(
|
||||
"%s test: Failed to open process file descriptor\n",
|
||||
test_name);
|
||||
|
||||
signal(SIGUSR1, set_signal_received_on_sigusr1);
|
||||
|
||||
ret = sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0);
|
||||
err = send_signal(pidfd);
|
||||
if (err)
|
||||
ksft_exit_fail_msg(
|
||||
"%s test: Error %d on sending pidfd signal\n",
|
||||
test_name, err);
|
||||
close(pidfd);
|
||||
if (ret < 0)
|
||||
ksft_exit_fail_msg("%s test: Failed to send signal\n",
|
||||
test_name);
|
||||
|
||||
if (signal_received != 1)
|
||||
ksft_exit_fail_msg("%s test: Failed to receive signal\n",
|
||||
test_name);
|
||||
/* Now try the same thing only using PIDFD_SELF_THREAD_GROUP. */
|
||||
err = send_signal(PIDFD_SELF_THREAD_GROUP);
|
||||
if (err)
|
||||
ksft_exit_fail_msg(
|
||||
"%s test: Error %d on PIDFD_SELF_THREAD_GROUP signal\n",
|
||||
test_name, err);
|
||||
|
||||
/*
|
||||
* Now try the same thing in a thread and assert thread ID is equal to
|
||||
* worker thread ID.
|
||||
*/
|
||||
if (pthread_create(&thread, NULL, send_signal_worker,
|
||||
(void *)(intptr_t)PIDFD_SELF_THREAD))
|
||||
ksft_exit_fail_msg("%s test: Failed to create thread\n",
|
||||
test_name);
|
||||
if (pthread_join(thread, &thread_res))
|
||||
ksft_exit_fail_msg("%s test: Failed to join thread\n",
|
||||
test_name);
|
||||
err = (int)(intptr_t)thread_res;
|
||||
if (err)
|
||||
ksft_exit_fail_msg(
|
||||
"%s test: Error %d on PIDFD_SELF_THREAD signal\n",
|
||||
test_name, err);
|
||||
|
||||
signal_received = 0;
|
||||
ksft_test_result_pass("%s test: Sent signal\n", test_name);
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user