Merge tag 'perf-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull performance events updates from Ingo Molnar:
 "Core perf code updates:

   - Convert mmap() related reference counts to refcount_t. This is a
     reaction to the recently fixed refcount bugs, which refcount_t
     could have detected earlier and partially mitigated (Thomas
     Gleixner, Peter Zijlstra)

   - Clean up and simplify the callchain code, in preparation for
     sframes (Steven Rostedt, Josh Poimboeuf)

  Uprobes updates:

   - Add support for optimizing USDT probes on x86-64, which gives a
     substantial speedup (Jiri Olsa)

   - Cleanups and fixes on x86 (Peter Zijlstra)

  PMU driver updates:

   - Various optimizations and fixes to the Intel PMU driver (Dapeng Mi)

  Misc cleanups and fixes:

   - Remove redundant __GFP_NOWARN (Qianfeng Rong)"
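
For context on the refcount_t conversion mentioned in the first bullet: unlike atomic_t, refcount_t refuses to wrap, saturating on overflow and tripping a WARN on increment-from-zero and on underflow, so the class of refcount bugs referred to above produces a diagnostic instead of silently corrupting the count. A minimal sketch of the API as it is used in the mmap paths further down (illustrative object and function names, not the perf code itself):

#include <linux/refcount.h>
#include <linux/slab.h>

struct mapped_obj {
	refcount_t	mmap_count;
	/* ... payload ... */
};

static struct mapped_obj *obj_create(void)
{
	struct mapped_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (obj)
		refcount_set(&obj->mmap_count, 1);	/* first mapping */
	return obj;
}

static bool obj_get(struct mapped_obj *obj)
{
	/* Fails (returns false) once the count has already hit zero. */
	return refcount_inc_not_zero(&obj->mmap_count);
}

static void obj_put(struct mapped_obj *obj)
{
	if (refcount_dec_and_test(&obj->mmap_count))
		kfree(obj);
}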

* tag 'perf-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
  selftests/bpf: Fix uprobe_sigill test for uprobe syscall error value
  uprobes/x86: Return error from uprobe syscall when not called from trampoline
  perf: Skip user unwind if the task is a kernel thread
  perf: Simplify get_perf_callchain() user logic
  perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of current->mm == NULL
  perf: Have get_perf_callchain() return NULL if crosstask and user are set
  perf: Remove get_perf_callchain() init_nr argument
  perf/x86: Print PMU counters bitmap in x86_pmu_show_pmu_cap()
  perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK
  perf/x86/intel: Change macro GLOBAL_CTRL_EN_PERF_METRICS to BIT_ULL(48)
  perf/x86: Add PERF_CAP_PEBS_TIMING_INFO flag
  perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error
  perf/x86/intel: Use early_initcall() to hook bts_init()
  uprobes: Remove redundant __GFP_NOWARN
  selftests/seccomp: validate uprobe syscall passes through seccomp
  seccomp: passthrough uprobe systemcall without filtering
  selftests/bpf: Fix uprobe syscall shadow stack test
  selftests/bpf: Change test_uretprobe_regs_change for uprobe and uretprobe
  selftests/bpf: Add uprobe_regs_equal test
  selftests/bpf: Add optimized usdt variant for basic usdt test
  ...
Linus Torvalds, 2025-09-30 11:11:21 -07:00
32 changed files with 2262 additions and 416 deletions


@@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
if (max_depth > sysctl_perf_event_max_stack)
max_depth = sysctl_perf_event_max_stack;
trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
trace = get_perf_callchain(regs, kernel, user, max_depth,
false, false);
if (unlikely(!trace))
@@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else if (kernel && task)
trace = get_callchain_entry_for_task(task, max_depth);
else
trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
trace = get_perf_callchain(regs, kernel, user, max_depth,
crosstask, false);
if (unlikely(!trace) || trace->nr < skip) {


@@ -217,22 +217,26 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
}
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
int rctx, start_entry_idx;
/* crosstask is not supported for user stacks */
if (crosstask && user && !kernel)
return NULL;
entry = get_callchain_entry(&rctx);
if (!entry)
return NULL;
ctx.entry = entry;
ctx.max_stack = max_stack;
ctx.nr = entry->nr = init_nr;
ctx.contexts = 0;
ctx.contexts_maxed = false;
ctx.entry = entry;
ctx.max_stack = max_stack;
ctx.nr = entry->nr = 0;
ctx.contexts = 0;
ctx.contexts_maxed = false;
if (kernel && !user_mode(regs)) {
if (add_mark)
@@ -240,25 +244,19 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
perf_callchain_kernel(&ctx, regs);
}
if (user) {
if (user && !crosstask) {
if (!user_mode(regs)) {
if (current->mm)
regs = task_pt_regs(current);
else
regs = NULL;
}
if (regs) {
if (crosstask)
if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
goto exit_put;
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
start_entry_idx = entry->nr;
perf_callchain_user(&ctx, regs);
fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
regs = task_pt_regs(current);
}
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
start_entry_idx = entry->nr;
perf_callchain_user(&ctx, regs);
fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
}
exit_put:
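
The net effect of this hunk: the init_nr argument is gone (every caller passed 0), cross-task user-only requests now return NULL up front, and the user-unwind leg is skipped for PF_KTHREAD / PF_USER_WORKER tasks. A caller therefore looks roughly like the sketch below; the argument order follows the prototype in the hunk, the surrounding locals are made up:

	struct perf_callchain_entry *entry;

	entry = get_perf_callchain(regs,
				   true,	/* kernel frames         */
				   true,	/* user frames           */
				   max_stack,	/* depth limit           */
				   false,	/* crosstask             */
				   true);	/* store context markers */
	if (!entry)
		return;		/* out of entries, or crosstask user-only */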


@@ -3974,7 +3974,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
*/
static inline bool event_update_userpage(struct perf_event *event)
{
if (likely(!atomic_read(&event->mmap_count)))
if (likely(!refcount_read(&event->mmap_count)))
return false;
perf_event_update_time(event);
@@ -6710,11 +6710,11 @@ static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
mapped_f mapped = get_mapped(event, event_mapped);
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
refcount_inc(&event->mmap_count);
refcount_inc(&event->rb->mmap_count);
if (vma->vm_pgoff)
atomic_inc(&event->rb->aux_mmap_count);
refcount_inc(&event->rb->aux_mmap_count);
if (mapped)
mapped(event, vma->vm_mm);
@@ -6749,7 +6749,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
* to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
/*
* Stop all AUX events that are writing to this buffer,
* so that we can free its AUX pages and corresponding PMU
@@ -6769,10 +6769,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&rb->aux_mutex);
}
if (atomic_dec_and_test(&rb->mmap_count))
if (refcount_dec_and_test(&rb->mmap_count))
detach_rest = true;
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
ring_buffer_attach(event, NULL);
@@ -6933,19 +6933,200 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
return err;
}
static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra)
{
unsigned long user_locked, user_lock_limit, locked, lock_limit;
struct user_struct *user = current_user();
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
/* Increase the limit linearly with more CPUs */
user_lock_limit *= num_online_cpus();
user_locked = atomic_long_read(&user->locked_vm);
/*
* sysctl_perf_event_mlock may have changed, so that
* user->locked_vm > user_lock_limit
*/
if (user_locked > user_lock_limit)
user_locked = user_lock_limit;
user_locked += *user_extra;
if (user_locked > user_lock_limit) {
/*
* charge locked_vm until it hits user_lock_limit;
* charge the rest from pinned_vm
*/
*extra = user_locked - user_lock_limit;
*user_extra -= *extra;
}
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra;
return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK);
}
static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra)
{
struct user_struct *user = current_user();
atomic_long_add(user_extra, &user->locked_vm);
atomic64_add(extra, &vma->vm_mm->pinned_vm);
}
static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
unsigned long nr_pages)
{
long extra = 0, user_extra = nr_pages;
struct perf_buffer *rb;
int rb_flags = 0;
nr_pages -= 1;
/*
* If we have rb pages ensure they're a power-of-two number, so we
* can do bitmasks instead of modulo.
*/
if (nr_pages != 0 && !is_power_of_2(nr_pages))
return -EINVAL;
WARN_ON_ONCE(event->ctx->parent_ctx);
if (event->rb) {
if (data_page_nr(event->rb) != nr_pages)
return -EINVAL;
if (refcount_inc_not_zero(&event->rb->mmap_count)) {
/*
* Success -- managed to mmap() the same buffer
* multiple times.
*/
perf_mmap_account(vma, user_extra, extra);
refcount_inc(&event->mmap_count);
return 0;
}
/*
* Raced against perf_mmap_close()'s
* refcount_dec_and_mutex_lock() remove the
* event and continue as if !event->rb
*/
ring_buffer_attach(event, NULL);
}
if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
return -EPERM;
if (vma->vm_flags & VM_WRITE)
rb_flags |= RING_BUFFER_WRITABLE;
rb = rb_alloc(nr_pages,
event->attr.watermark ? event->attr.wakeup_watermark : 0,
event->cpu, rb_flags);
if (!rb)
return -ENOMEM;
refcount_set(&rb->mmap_count, 1);
rb->mmap_user = get_current_user();
rb->mmap_locked = extra;
ring_buffer_attach(event, rb);
perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
perf_mmap_account(vma, user_extra, extra);
refcount_set(&event->mmap_count, 1);
return 0;
}
static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
unsigned long nr_pages)
{
long extra = 0, user_extra = nr_pages;
u64 aux_offset, aux_size;
struct perf_buffer *rb;
int ret, rb_flags = 0;
rb = event->rb;
if (!rb)
return -EINVAL;
guard(mutex)(&rb->aux_mutex);
/*
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
* mapped, all subsequent mappings should have the same size
* and offset. Must be above the normal perf buffer.
*/
aux_offset = READ_ONCE(rb->user_page->aux_offset);
aux_size = READ_ONCE(rb->user_page->aux_size);
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
return -EINVAL;
if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
return -EINVAL;
/* already mapped with a different offset */
if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
return -EINVAL;
if (aux_size != nr_pages * PAGE_SIZE)
return -EINVAL;
/* already mapped with a different size */
if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
return -EINVAL;
if (!is_power_of_2(nr_pages))
return -EINVAL;
if (!refcount_inc_not_zero(&rb->mmap_count))
return -EINVAL;
if (rb_has_aux(rb)) {
refcount_inc(&rb->aux_mmap_count);
} else {
if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
refcount_dec(&rb->mmap_count);
return -EPERM;
}
WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
rb_flags |= RING_BUFFER_WRITABLE;
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, rb_flags);
if (ret) {
refcount_dec(&rb->mmap_count);
return ret;
}
refcount_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
}
perf_mmap_account(vma, user_extra, extra);
refcount_inc(&event->mmap_count);
return 0;
}
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
struct mutex *aux_mutex = NULL;
struct perf_buffer *rb = NULL;
unsigned long locked, lock_limit;
unsigned long vma_size;
unsigned long nr_pages;
long user_extra = 0, extra = 0;
int ret, flags = 0;
unsigned long vma_size, nr_pages;
mapped_f mapped;
int ret;
/*
* Don't allow mmap() of inherited per-task counters. This would
@@ -6971,192 +7152,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (vma_size != PAGE_SIZE * nr_pages)
return -EINVAL;
user_extra = nr_pages;
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
/*
* This relies on __pmu_detach_event() taking mmap_mutex after marking
* the event REVOKED. Either we observe the state, or __pmu_detach_event()
* will detach the rb created here.
*/
if (event->state <= PERF_EVENT_STATE_REVOKED) {
ret = -ENODEV;
goto unlock;
}
if (vma->vm_pgoff == 0) {
nr_pages -= 1;
scoped_guard (mutex, &event->mmap_mutex) {
/*
* If we have rb pages ensure they're a power-of-two number, so we
* can do bitmasks instead of modulo.
* This relies on __pmu_detach_event() taking mmap_mutex after marking
* the event REVOKED. Either we observe the state, or __pmu_detach_event()
* will detach the rb created here.
*/
if (nr_pages != 0 && !is_power_of_2(nr_pages))
goto unlock;
if (event->state <= PERF_EVENT_STATE_REVOKED)
return -ENODEV;
WARN_ON_ONCE(event->ctx->parent_ctx);
if (event->rb) {
if (data_page_nr(event->rb) != nr_pages)
goto unlock;
if (atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
* Success -- managed to mmap() the same buffer
* multiple times.
*/
ret = 0;
/* We need the rb to map pages. */
rb = event->rb;
goto unlock;
}
/*
* Raced against perf_mmap_close()'s
* atomic_dec_and_mutex_lock() remove the
* event and continue as if !event->rb
*/
ring_buffer_attach(event, NULL);
}
} else {
/*
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
* mapped, all subsequent mappings should have the same size
* and offset. Must be above the normal perf buffer.
*/
u64 aux_offset, aux_size;
rb = event->rb;
if (!rb)
goto aux_unlock;
aux_mutex = &rb->aux_mutex;
mutex_lock(aux_mutex);
aux_offset = READ_ONCE(rb->user_page->aux_offset);
aux_size = READ_ONCE(rb->user_page->aux_size);
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
goto aux_unlock;
if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
goto aux_unlock;
/* already mapped with a different offset */
if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
goto aux_unlock;
if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
goto aux_unlock;
/* already mapped with a different size */
if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
goto aux_unlock;
if (!is_power_of_2(nr_pages))
goto aux_unlock;
if (!atomic_inc_not_zero(&rb->mmap_count))
goto aux_unlock;
if (rb_has_aux(rb)) {
atomic_inc(&rb->aux_mmap_count);
ret = 0;
goto unlock;
}
if (vma->vm_pgoff == 0)
ret = perf_mmap_rb(vma, event, nr_pages);
else
ret = perf_mmap_aux(vma, event, nr_pages);
if (ret)
return ret;
}
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
/*
* Increase the limit linearly with more CPUs:
*/
user_lock_limit *= num_online_cpus();
user_locked = atomic_long_read(&user->locked_vm);
/*
* sysctl_perf_event_mlock may have changed, so that
* user->locked_vm > user_lock_limit
*/
if (user_locked > user_lock_limit)
user_locked = user_lock_limit;
user_locked += user_extra;
if (user_locked > user_lock_limit) {
/*
* charge locked_vm until it hits user_lock_limit;
* charge the rest from pinned_vm
*/
extra = user_locked - user_lock_limit;
user_extra -= extra;
}
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
if ((locked > lock_limit) && perf_is_paranoid() &&
!capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
}
WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
flags |= RING_BUFFER_WRITABLE;
if (!rb) {
rb = rb_alloc(nr_pages,
event->attr.watermark ? event->attr.wakeup_watermark : 0,
event->cpu, flags);
if (!rb) {
ret = -ENOMEM;
goto unlock;
}
atomic_set(&rb->mmap_count, 1);
rb->mmap_user = get_current_user();
rb->mmap_locked = extra;
ring_buffer_attach(event, rb);
perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
ret = 0;
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, flags);
if (!ret) {
atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
}
}
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
atomic64_add(extra, &vma->vm_mm->pinned_vm);
atomic_inc(&event->mmap_count);
} else if (rb) {
/* AUX allocation failed */
atomic_dec(&rb->mmap_count);
}
aux_unlock:
if (aux_mutex)
mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
if (ret)
return ret;
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
@@ -7174,7 +7186,7 @@ aux_unlock:
* full cleanup in this case and therefore does not invoke
* vmops::close().
*/
ret = map_range(rb, vma);
ret = map_range(event->rb, vma);
if (ret)
perf_mmap_close(vma);
@@ -7440,7 +7452,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
} else if (!(current->flags & PF_KTHREAD)) {
} else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -8080,7 +8092,7 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0.
*/
if (current->mm != NULL) {
if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
struct page *p;
pagefault_disable();
@@ -8192,7 +8204,8 @@ struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
bool kernel = !event->attr.exclude_callchain_kernel;
bool user = !event->attr.exclude_callchain_user;
bool user = !event->attr.exclude_callchain_user &&
!(current->flags & (PF_KTHREAD | PF_USER_WORKER));
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
const u32 max_stack = event->attr.sample_max_stack;
@@ -8204,7 +8217,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!kernel && !user)
return &__empty_callchain;
callchain = get_perf_callchain(regs, 0, kernel, user,
callchain = get_perf_callchain(regs, kernel, user,
max_stack, crosstask, true);
return callchain ?: &__empty_callchain;
}
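
Several hunks in this file swap current->mm == NULL (or a bare PF_KTHREAD test) for a check of PF_KTHREAD | PF_USER_WORKER: io_uring and vhost user workers do own a valid ->mm, but they never execute in user mode, so their registers and user stack are not worth sampling and the old ->mm test missed them. The predicate being encoded is roughly this (illustrative helper, not part of the patch):

	#include <linux/sched.h>

	static inline bool no_user_context(void)
	{
		/* Kernel threads have no ->mm at all; PF_USER_WORKER tasks
		 * (io_uring/vhost workers) have one but never return to
		 * user space, so checking the flags covers both. */
		return current->flags & (PF_KTHREAD | PF_USER_WORKER);
	}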
@@ -13249,7 +13262,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
if (refcount_read(&event->mmap_count))
goto unlock;
if (output_event) {
@@ -13262,7 +13275,7 @@ set:
goto unlock;
/* did we race against perf_mmap_close() */
if (!atomic_read(&rb->mmap_count)) {
if (!refcount_read(&rb->mmap_count)) {
ring_buffer_put(rb);
goto unlock;
}
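
The rewritten perf_mmap() path above leans on guard(mutex)(...) and scoped_guard(mutex, ...) from <linux/cleanup.h> instead of the old unlock/goto labels. Roughly, as a sketch with made-up functions rather than the perf code:

	#include <linux/cleanup.h>
	#include <linux/mutex.h>

	static DEFINE_MUTEX(example_lock);

	static int with_guard(bool ok)
	{
		/* Held from here until the function returns, on every path. */
		guard(mutex)(&example_lock);

		if (!ok)
			return -EINVAL;		/* unlocked automatically */
		return 0;
	}

	static int with_scoped_guard(bool ok)
	{
		/* Held only for the braced block, which is how the new
		 * perf_mmap() wraps its event->mmap_mutex critical section. */
		scoped_guard(mutex, &example_lock) {
			if (!ok)
				return -EINVAL;	/* still unlocked on the way out */
		}
		return 0;
	}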


@@ -35,7 +35,7 @@ struct perf_buffer {
spinlock_t event_lock;
struct list_head event_list;
atomic_t mmap_count;
refcount_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
@@ -47,7 +47,7 @@ struct perf_buffer {
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
atomic_t aux_mmap_count;
refcount_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
refcount_t aux_refcount;


@@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* the same order, see perf_mmap_close. Otherwise we end up freeing
* aux pages in this path, which is a bug, because in_atomic().
*/
if (!atomic_read(&rb->aux_mmap_count))
if (!refcount_read(&rb->aux_mmap_count))
goto err;
if (!refcount_inc_not_zero(&rb->aux_refcount))


@@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
return is_swbp_insn(insn);
}
static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
@@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
kunmap_atomic(kaddr);
}
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
int nbytes, void *data)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -205,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
if (is_swbp_insn(new_opcode)) {
if (is_swbp_insn(insn)) {
if (is_swbp) /* register: already installed? */
return 0;
} else {
@@ -399,12 +400,12 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
return identical;
}
static int __uprobe_write_opcode(struct vm_area_struct *vma,
static int __uprobe_write(struct vm_area_struct *vma,
struct folio_walk *fw, struct folio *folio,
unsigned long opcode_vaddr, uprobe_opcode_t opcode)
unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
bool is_register)
{
const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
const bool is_register = !!is_swbp_insn(&opcode);
const unsigned long vaddr = insn_vaddr & PAGE_MASK;
bool pmd_mappable;
/* For now, we'll only handle PTE-mapped folios. */
@@ -429,7 +430,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma,
*/
flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
copy_to_page(fw->page, insn_vaddr, insn, nbytes);
/*
* When unregistering, we may only zap a PTE if uffd is disabled and
@@ -482,23 +483,32 @@ remap:
* @opcode_vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @opcode_vaddr.
*
* Called with mm->mmap_lock held for read or write.
* Called with mm->mmap_lock held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
const unsigned long opcode_vaddr, uprobe_opcode_t opcode,
bool is_register)
{
const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
}
int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
void *data)
{
const unsigned long vaddr = insn_vaddr & PAGE_MASK;
struct mm_struct *mm = vma->vm_mm;
struct uprobe *uprobe;
int ret, is_register, ref_ctr_updated = 0;
int ret, ref_ctr_updated = 0;
unsigned int gup_flags = FOLL_FORCE;
struct mmu_notifier_range range;
struct folio_walk fw;
struct folio *folio;
struct page *page;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
@@ -509,7 +519,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
* page that we can safely modify. Use FOLL_WRITE to trigger a write
* fault if required. When unregistering, we might be lucky and the
* anon page is already gone. So defer write faults until really
* required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
* required. Use FOLL_SPLIT_PMD, because __uprobe_write()
* cannot deal with PMDs yet.
*/
if (is_register)
@@ -521,14 +531,14 @@ retry:
goto out;
folio = page_folio(page);
ret = verify_opcode(page, opcode_vaddr, &opcode);
ret = verify(page, insn_vaddr, insn, nbytes, data);
if (ret <= 0) {
folio_put(folio);
goto out;
}
/* We are going to replace instruction, update ref_ctr. */
if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
if (ret) {
folio_put(folio);
@@ -560,7 +570,7 @@ retry:
/* Walk the page tables again, to perform the actual update. */
if (folio_walk_start(&fw, vma, vaddr, 0)) {
if (fw.page == page)
ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register);
folio_walk_end(&fw, vma);
}
@@ -580,7 +590,7 @@ retry:
out:
/* Revert back reference counter if instruction update failed. */
if (ret < 0 && ref_ctr_updated)
if (do_update_ref_ctr && ret < 0 && ref_ctr_updated)
update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
/* try collapse pmd for compound page */
@@ -602,7 +612,7 @@ out:
int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true);
}
/**
@@ -618,7 +628,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe,
struct vm_area_struct *vma, unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr,
*(uprobe_opcode_t *)&auprobe->insn);
*(uprobe_opcode_t *)&auprobe->insn, false);
}
/* uprobe should have guaranteed positive refcount */
@@ -1051,7 +1061,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
if (IS_ERR(page))
return PTR_ERR(page);
copy_from_page(page, offset, insn, nbytes);
uprobe_copy_from_page(page, offset, insn, nbytes);
put_page(page);
return 0;
@@ -1210,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
GFP_NOWAIT | __GFP_NOMEMALLOC);
if (prev)
prev->next = NULL;
}
@@ -1397,7 +1407,7 @@ struct uprobe *uprobe_register(struct inode *inode,
return ERR_PTR(-EINVAL);
/*
* This ensures that copy_from_page(), copy_to_page() and
* This ensures that uprobe_copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
@@ -1463,7 +1473,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
struct vm_area_struct *vma;
int err = 0;
mmap_read_lock(mm);
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1480,7 +1490,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
vaddr = offset_to_vaddr(vma, uprobe->offset);
err |= remove_breakpoint(uprobe, vma, vaddr);
}
mmap_read_unlock(mm);
mmap_write_unlock(mm);
return err;
}
@@ -1726,7 +1736,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
return ret;
}
void * __weak arch_uprobe_trampoline(unsigned long *psize)
void * __weak arch_uretprobe_trampoline(unsigned long *psize)
{
static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
@@ -1758,7 +1768,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
insns = arch_uprobe_trampoline(&insns_size);
insns = arch_uretprobe_trampoline(&insns_size);
arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
if (!xol_add_vma(mm, area))
@@ -1792,6 +1802,14 @@ static struct xol_area *get_xol_area(void)
return area;
}
void __weak arch_uprobe_clear_state(struct mm_struct *mm)
{
}
void __weak arch_uprobe_init_state(struct mm_struct *mm)
{
}
/*
* uprobe_clear_state - Free the area allocated for slots.
*/
@@ -1803,6 +1821,8 @@ void uprobe_clear_state(struct mm_struct *mm)
delayed_uprobe_remove(NULL, mm);
mutex_unlock(&delayed_uprobe_lock);
arch_uprobe_clear_state(mm);
if (!area)
return;
@@ -2393,7 +2413,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;
copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */
@@ -2677,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
return true;
}
void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
{
}
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2741,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
/* Try to optimize after first hit. */
arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
@@ -2752,6 +2779,23 @@ out:
rcu_read_unlock_trace();
}
void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
{
struct uprobe *uprobe;
int is_swbp;
guard(rcu_tasks_trace)();
uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe)
return;
if (!get_utask())
return;
if (arch_uprobe_ignore(&uprobe->arch, regs))
return;
handler_chain(uprobe, regs);
}
/*
* Perform required fix-ups and disable singlestep.
* Allow pending signals to take effect.
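
Taken together, these uprobes hunks turn uprobe_write_opcode() into a thin wrapper around a more general uprobe_write(), which takes an arbitrary byte count and a caller-supplied verify callback, so that x86 can write a larger instruction for the optimized probes this series adds rather than only the one-byte breakpoint. Judging from the call site verify(page, insn_vaddr, insn, nbytes, data) and the new verify_opcode() signature, the callback type is roughly the following (inferred here, not copied from the uprobes headers):

	typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
					     uprobe_opcode_t *insn, int nbytes,
					     void *data);

	/* Hypothetical caller-supplied verifier: always allow the write.
	 * uprobe_write() treats a return of 0 as "nothing to do" and a
	 * negative value as an error, and only performs the write when the
	 * verifier returns > 0, matching how verify_opcode() is used above. */
	static int verify_always(struct page *page, unsigned long vaddr,
				 uprobe_opcode_t *insn, int nbytes, void *data)
	{
		return 1;
	}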


@@ -1014,6 +1014,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
mm->uprobes_state.xol_area = NULL;
arch_uprobe_init_state(mm);
#endif
}


@@ -741,6 +741,26 @@ out:
}
#ifdef SECCOMP_ARCH_NATIVE
static bool seccomp_uprobe_exception(struct seccomp_data *sd)
{
#if defined __NR_uretprobe || defined __NR_uprobe
#ifdef SECCOMP_ARCH_COMPAT
if (sd->arch == SECCOMP_ARCH_NATIVE)
#endif
{
#ifdef __NR_uretprobe
if (sd->nr == __NR_uretprobe)
return true;
#endif
#ifdef __NR_uprobe
if (sd->nr == __NR_uprobe)
return true;
#endif
}
#endif
return false;
}
/**
* seccomp_is_const_allow - check if filter is constant allow with given data
* @fprog: The BPF programs
@@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
return false;
/* Our single exception to filtering. */
#ifdef __NR_uretprobe
#ifdef SECCOMP_ARCH_COMPAT
if (sd->arch == SECCOMP_ARCH_NATIVE)
#endif
if (sd->nr == __NR_uretprobe)
return true;
#endif
if (seccomp_uprobe_exception(sd))
return true;
for (pc = 0; pc < fprog->len; pc++) {
struct sock_filter *insn = &fprog->filter[pc];
@@ -1042,6 +1057,9 @@ static const int mode1_syscalls[] = {
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
#ifdef __NR_uretprobe
__NR_uretprobe,
#endif
#ifdef __NR_uprobe
__NR_uprobe,
#endif
-1, /* negative terminated */
};


@@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
COND_SYSCALL(rseq);
COND_SYSCALL(uretprobe);
COND_SYSCALL(uprobe);