From 63c49efc987afefc6b9bb7de083eb8748e0b1789 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:17 -0800 Subject: [PATCH 01/36] selftests/bpf: Add simple strscpy() implementation Replace bpf_strlcpy() in bpf_util.h with a sized_strscpy(), which is a simplified sized_strscpy() from the kernel (lib/string.c [1]). It: * takes a count (destination size) parameter * guarantees NULL-termination * returns the number of characters copied or -E2BIG Re-define strscpy macro similar to in-kernel implementation [2]: allow the count parameter to be optional. Add #ifdef-s to tools/include/linux/args.h, as they may be defined in other system headers (for example, __CONCAT in sys/cdefs.h). Fixup the single existing bpf_strlcpy() call in cgroup_helpers.c [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/string.c?h=v6.19#n113 [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/string.h?h=v6.19#n91 Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-2-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/include/linux/args.h | 4 ++ tools/testing/selftests/bpf/bpf_util.h | 45 ++++++++++++++------ tools/testing/selftests/bpf/cgroup_helpers.c | 2 +- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/tools/include/linux/args.h b/tools/include/linux/args.h index 2e8e65d975c7..14b268f2389a 100644 --- a/tools/include/linux/args.h +++ b/tools/include/linux/args.h @@ -22,7 +22,11 @@ #define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) /* Concatenate two parameters, but allow them to be expanded beforehand. */ +#ifndef __CONCAT #define __CONCAT(a, b) a ## b +#endif +#ifndef CONCATENATE #define CONCATENATE(a, b) __CONCAT(a, b) +#endif #endif /* _LINUX_ARGS_H */ diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index 4bc2d25f33e1..6cb56501a505 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -8,6 +8,7 @@ #include #include #include /* libbpf_num_possible_cpus */ +#include static inline unsigned int bpf_num_possible_cpus(void) { @@ -21,25 +22,43 @@ static inline unsigned int bpf_num_possible_cpus(void) return possible_cpus; } -/* Copy up to sz - 1 bytes from zero-terminated src string and ensure that dst - * is zero-terminated string no matter what (unless sz == 0, in which case - * it's a no-op). It's conceptually close to FreeBSD's strlcpy(), but differs - * in what is returned. Given this is internal helper, it's trivial to extend - * this, when necessary. Use this instead of strncpy inside libbpf source code. +/* + * Simplified strscpy() implementation. The kernel one is in lib/string.c */ -static inline void bpf_strlcpy(char *dst, const char *src, size_t sz) +static inline ssize_t sized_strscpy(char *dest, const char *src, size_t count) { - size_t i; + long res = 0; - if (sz == 0) - return; + if (count == 0) + return -E2BIG; - sz--; - for (i = 0; i < sz && src[i]; i++) - dst[i] = src[i]; - dst[i] = '\0'; + while (count > 1) { + char c; + + c = src[res]; + dest[res] = c; + if (!c) + return res; + res++; + count--; + } + + /* Force NUL-termination. */ + dest[res] = '\0'; + + /* Return E2BIG if the source didn't stop */ + return src[res] ? -E2BIG : res; } +#define __strscpy0(dst, src, ...) \ + sized_strscpy(dst, src, sizeof(dst)) +#define __strscpy1(dst, src, size) \ + sized_strscpy(dst, src, size) + +#undef strscpy /* Redefine the placeholder from tools/include/linux/string.h */ +#define strscpy(dst, src, ...) \ + CONCATENATE(__strscpy, COUNT_ARGS(__VA_ARGS__))(dst, src, __VA_ARGS__) + #define __bpf_percpu_val_align __attribute__((__aligned__(8))) #define BPF_DECLARE_PERCPU(type, name) \ diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 20cede4db3ce..45cd0b479fe3 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -86,7 +86,7 @@ static int __enable_controllers(const char *cgroup_path, const char *controllers enable[len] = 0; close(fd); } else { - bpf_strlcpy(enable, controllers, sizeof(enable)); + strscpy(enable, controllers); } snprintf(path, sizeof(path), "%s/cgroup.subtree_control", cgroup_path); From 6a087ac23f5b6619033ad08e60c355b00bb8ce51 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:18 -0800 Subject: [PATCH 02/36] selftests/bpf: Replace strcpy() calls with strscpy() strcpy() does not perform bounds checking and is considered deprecated [1]. Replace strcpy() calls with strscpy() defined in bpf_util.h. [1] https://docs.kernel.org/process/deprecated.html#strcpy Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-3-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/network_helpers.c | 2 +- tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 2 +- tools/testing/selftests/bpf/prog_tests/setget_sockopt.c | 2 +- tools/testing/selftests/bpf/prog_tests/sockopt_sk.c | 2 +- tools/testing/selftests/bpf/prog_tests/test_veristat.c | 4 ++-- tools/testing/selftests/bpf/xdp_features.c | 3 ++- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 0a6a5561bed3..5374b7e16d53 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -432,7 +432,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port, memset(addr, 0, sizeof(*sun)); sun->sun_family = family; sun->sun_path[0] = 0; - strcpy(sun->sun_path + 1, addr_str); + strscpy(sun->sun_path + 1, addr_str, sizeof(sun->sun_path) - 1); if (len) *len = offsetof(struct sockaddr_un, sun_path) + 1 + strlen(addr_str); return 0; diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index b7d1b52309d0..f829b6f09bc9 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -281,7 +281,7 @@ static void test_dctcp_fallback(void) dctcp_skel = bpf_dctcp__open(); if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel")) return; - strcpy(dctcp_skel->rodata->fallback_cc, "cubic"); + strscpy(dctcp_skel->rodata->fallback_cc, "cubic"); if (!ASSERT_OK(bpf_dctcp__load(dctcp_skel), "bpf_dctcp__load")) goto done; diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c index e4dac529d424..77fe1bfb7504 100644 --- a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c @@ -212,7 +212,7 @@ void test_setget_sockopt(void) if (!ASSERT_OK_PTR(skel, "open skel")) goto done; - strcpy(skel->rodata->veth, "binddevtest1"); + strscpy(skel->rodata->veth, "binddevtest1"); skel->rodata->veth_ifindex = if_nametoindex("binddevtest1"); if (!ASSERT_GT(skel->rodata->veth_ifindex, 0, "if_nametoindex")) goto done; diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index ba6b3ec1156a..53637431ec5d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -142,7 +142,7 @@ static int getsetsockopt(void) /* TCP_CONGESTION can extend the string */ - strcpy(buf.cc, "nv"); + strscpy(buf.cc, "nv"); err = setsockopt(fd, SOL_TCP, TCP_CONGESTION, &buf, strlen("nv")); if (err) { log_err("Failed to call setsockopt(TCP_CONGESTION)"); diff --git a/tools/testing/selftests/bpf/prog_tests/test_veristat.c b/tools/testing/selftests/bpf/prog_tests/test_veristat.c index b38c16b4247f..9aff08ac55c0 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_veristat.c +++ b/tools/testing/selftests/bpf/prog_tests/test_veristat.c @@ -24,9 +24,9 @@ static struct fixture *init_fixture(void) /* for no_alu32 and cpuv4 veristat is in parent folder */ if (access("./veristat", F_OK) == 0) - strcpy(fix->veristat, "./veristat"); + strscpy(fix->veristat, "./veristat"); else if (access("../veristat", F_OK) == 0) - strcpy(fix->veristat, "../veristat"); + strscpy(fix->veristat, "../veristat"); else PRINT_FAIL("Can't find veristat binary"); diff --git a/tools/testing/selftests/bpf/xdp_features.c b/tools/testing/selftests/bpf/xdp_features.c index 595c79141cf3..a27ed663967c 100644 --- a/tools/testing/selftests/bpf/xdp_features.c +++ b/tools/testing/selftests/bpf/xdp_features.c @@ -16,6 +16,7 @@ #include +#include "bpf_util.h" #include "xdp_features.skel.h" #include "xdp_features.h" @@ -212,7 +213,7 @@ static void set_env_default(void) env.feature.drv_feature = NETDEV_XDP_ACT_NDO_XMIT; env.feature.action = -EINVAL; env.ifindex = -ENODEV; - strcpy(env.ifname, "unknown"); + strscpy(env.ifname, "unknown"); make_sockaddr(AF_INET6, "::ffff:127.0.0.1", DUT_CTRL_PORT, &env.dut_ctrl_addr, NULL); make_sockaddr(AF_INET6, "::ffff:127.0.0.1", DUT_ECHO_PORT, From 43277d8f57b490550d33f6f4cb73c28ec6024696 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:19 -0800 Subject: [PATCH 03/36] selftests/bpf: Replace strncpy() with strscpy() strncpy() does not guarantee NULL-termination and is considered deprecated [1]. Replace strncpy() calls with strscpy(). [1] https://docs.kernel.org/process/deprecated.html#strncpy-on-nul-terminated-strings Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-4-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/network_helpers.c | 3 +-- tools/testing/selftests/bpf/prog_tests/bpf_iter.c | 3 +-- tools/testing/selftests/bpf/prog_tests/flow_dissector.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/queue_stack_map.c | 4 ++-- tools/testing/selftests/bpf/prog_tests/skc_to_unix_sock.c | 2 +- tools/testing/selftests/bpf/prog_tests/task_local_data.h | 2 +- tools/testing/selftests/bpf/prog_tests/tc_redirect.c | 2 +- tools/testing/selftests/bpf/test_progs.c | 2 +- tools/testing/selftests/bpf/xdp_hw_metadata.c | 4 ++-- 9 files changed, 12 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 5374b7e16d53..b82f572641b7 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -581,8 +581,7 @@ int open_tuntap(const char *dev_name, bool need_mac) return -1; ifr.ifr_flags = IFF_NO_PI | (need_mac ? IFF_TAP : IFF_TUN); - strncpy(ifr.ifr_name, dev_name, IFNAMSIZ - 1); - ifr.ifr_name[IFNAMSIZ - 1] = '\0'; + strscpy(ifr.ifr_name, dev_name); err = ioctl(fd, TUNSETIFF, &ifr); if (!ASSERT_OK(err, "ioctl(TUNSETIFF)")) { diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 5225d69bf79b..c69080ca14f5 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -346,8 +346,7 @@ static void test_task_sleepable(void) close(finish_pipe[1]); test_data = malloc(sizeof(char) * 10); - strncpy(test_data, "test_data", 10); - test_data[9] = '\0'; + strscpy(test_data, "test_data", 10); test_data_long = malloc(sizeof(char) * 5000); for (int i = 0; i < 5000; ++i) { diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c index 08bae13248c4..fb4892681464 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c @@ -570,7 +570,7 @@ static int create_tap(const char *ifname) }; int fd, ret; - strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + strscpy(ifr.ifr_name, ifname); fd = open("/dev/net/tun", O_RDWR); if (fd < 0) @@ -599,7 +599,7 @@ static int ifup(const char *ifname) struct ifreq ifr = {}; int sk, ret; - strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)); + strscpy(ifr.ifr_name, ifname); sk = socket(PF_INET, SOCK_DGRAM, 0); if (sk < 0) diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c index a043af9cd6d9..41441325e179 100644 --- a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c +++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c @@ -28,9 +28,9 @@ static void test_queue_stack_map_by_type(int type) vals[i] = rand(); if (type == QUEUE) - strncpy(file, "./test_queue_map.bpf.o", sizeof(file)); + strscpy(file, "./test_queue_map.bpf.o"); else if (type == STACK) - strncpy(file, "./test_stack_map.bpf.o", sizeof(file)); + strscpy(file, "./test_stack_map.bpf.o"); else return; diff --git a/tools/testing/selftests/bpf/prog_tests/skc_to_unix_sock.c b/tools/testing/selftests/bpf/prog_tests/skc_to_unix_sock.c index 3eefdfed1db9..657d897958b6 100644 --- a/tools/testing/selftests/bpf/prog_tests/skc_to_unix_sock.c +++ b/tools/testing/selftests/bpf/prog_tests/skc_to_unix_sock.c @@ -34,7 +34,7 @@ void test_skc_to_unix_sock(void) memset(&sockaddr, 0, sizeof(sockaddr)); sockaddr.sun_family = AF_UNIX; - strncpy(sockaddr.sun_path, sock_path, strlen(sock_path)); + strscpy(sockaddr.sun_path, sock_path); sockaddr.sun_path[0] = '\0'; err = bind(sockfd, (struct sockaddr *)&sockaddr, sizeof(sockaddr)); diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h index 0f86b9275cf9..8342e2fe5260 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_data.h +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h @@ -262,7 +262,7 @@ retry: if (!atomic_compare_exchange_strong(&tld_meta_p->cnt, &cnt, cnt + 1)) goto retry; - strncpy(tld_meta_p->metadata[i].name, name, TLD_NAME_LEN); + strscpy(tld_meta_p->metadata[i].name, name); atomic_store(&tld_meta_p->metadata[i].size, size); return (tld_key_t){(__s16)off}; } diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 76d72a59365e..64fbda082309 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -1095,7 +1095,7 @@ static int tun_open(char *name) ifr.ifr_flags = IFF_TUN | IFF_NO_PI; if (*name) - strncpy(ifr.ifr_name, name, IFNAMSIZ); + strscpy(ifr.ifr_name, name); err = ioctl(fd, TUNSETIFF, &ifr); if (!ASSERT_OK(err, "ioctl TUNSETIFF")) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 02a85dda30e6..d1418ec1f351 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1799,7 +1799,7 @@ static int worker_main_send_subtests(int sock, struct test_state *state) msg.subtest_done.num = i; - strncpy(msg.subtest_done.name, subtest_state->name, MAX_SUBTEST_NAME); + strscpy(msg.subtest_done.name, subtest_state->name, MAX_SUBTEST_NAME); msg.subtest_done.error_cnt = subtest_state->error_cnt; msg.subtest_done.skipped = subtest_state->skipped; diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 3d8de0d4c96a..6db3b5555a22 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -550,7 +550,7 @@ static int rxq_num(const char *ifname) struct ifreq ifr = { .ifr_data = (void *)&ch, }; - strncpy(ifr.ifr_name, ifname, IF_NAMESIZE - 1); + strscpy(ifr.ifr_name, ifname); int fd, ret; fd = socket(AF_UNIX, SOCK_DGRAM, 0); @@ -571,7 +571,7 @@ static void hwtstamp_ioctl(int op, const char *ifname, struct hwtstamp_config *c struct ifreq ifr = { .ifr_data = (void *)cfg, }; - strncpy(ifr.ifr_name, ifname, IF_NAMESIZE - 1); + strscpy(ifr.ifr_name, ifname); int fd, ret; fd = socket(AF_UNIX, SOCK_DGRAM, 0); From 3ed0bc2d4994c718fb5476b45d4aafec42a1d040 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:20 -0800 Subject: [PATCH 04/36] selftests/bpf: Use strscpy in bpftool_helpers.c Replace strncpy() calls in bpftool_helpers.c with strscpy(). Pass the destination buffer size to detect_bpftool_path() instead of hardcoding BPFTOOL_PATH_MAX_LEN. Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-5-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpftool_helpers.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/bpftool_helpers.c b/tools/testing/selftests/bpf/bpftool_helpers.c index a5824945a4a5..595a636fa13b 100644 --- a/tools/testing/selftests/bpf/bpftool_helpers.c +++ b/tools/testing/selftests/bpf/bpftool_helpers.c @@ -1,15 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only -#include "bpftool_helpers.h" #include #include #include +#include "bpf_util.h" +#include "bpftool_helpers.h" + #define BPFTOOL_PATH_MAX_LEN 64 #define BPFTOOL_FULL_CMD_MAX_LEN 512 #define BPFTOOL_DEFAULT_PATH "tools/sbin/bpftool" -static int detect_bpftool_path(char *buffer) +static int detect_bpftool_path(char *buffer, size_t size) { char tmp[BPFTOOL_PATH_MAX_LEN]; @@ -18,7 +20,7 @@ static int detect_bpftool_path(char *buffer) */ snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "./%s", BPFTOOL_DEFAULT_PATH); if (access(tmp, X_OK) == 0) { - strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + strscpy(buffer, tmp, size); return 0; } @@ -27,7 +29,7 @@ static int detect_bpftool_path(char *buffer) */ snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "../%s", BPFTOOL_DEFAULT_PATH); if (access(tmp, X_OK) == 0) { - strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + strscpy(buffer, tmp, size); return 0; } @@ -44,7 +46,7 @@ static int run_command(char *args, char *output_buf, size_t output_max_len) int ret; /* Detect and cache bpftool binary location */ - if (bpftool_path[0] == 0 && detect_bpftool_path(bpftool_path)) + if (bpftool_path[0] == 0 && detect_bpftool_path(bpftool_path, sizeof(bpftool_path))) return 1; ret = snprintf(command, BPFTOOL_FULL_CMD_MAX_LEN, "%s %s%s", From 9d8685239e85ba5a4899b5b3534c732e3ae5f2aa Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:21 -0800 Subject: [PATCH 05/36] selftests/bpf: Use memcpy() for bounded non-NULL-terminated copies Replace strncpy() with memcpy() in cases where the source is non-NULL-terminated and the copy length is known. Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-6-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c | 6 ++++-- tools/testing/selftests/bpf/test_verifier.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c index dd75ccb03770..469e92869523 100644 --- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c +++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c @@ -308,8 +308,10 @@ static int find_field_offset(struct btf *btf, char *pattern, regmatch_t *matches return -1; } - strncpy(type_str, type, type_sz); - strncpy(field_str, field, field_sz); + memcpy(type_str, type, type_sz); + type_str[type_sz] = '\0'; + memcpy(field_str, field, field_sz); + field_str[field_sz] = '\0'; btf_id = btf__find_by_name(btf, type_str); if (btf_id < 0) { PRINT_FAIL("No BTF info for type %s\n", type_str); diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 27db34ecf3f5..a8ae03c57bba 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1320,7 +1320,7 @@ static bool cmp_str_seq(const char *log, const char *exp) printf("FAIL\nTestcase bug\n"); return false; } - strncpy(needle, exp, len); + memcpy(needle, exp, len); needle[len] = 0; q = strstr(log, needle); if (!q) { From 4021848a903e65f74cf88997c12b96d6bc2453e6 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:22 -0800 Subject: [PATCH 06/36] selftests/bpf: Pass through build flags to bpftool and resolve_btfids EXTRA_* and SAN_* build flags were not correctly propagated to bpftool and resolve_btids when building selftests/bpf. This led to various build errors on attempt to build with SAN_CFLAGS="-fsanitize=address", for example. Fix the makefiles to address this: - Pass SAN_CFLAGS/SAN_LDFLAGS to bpftool and resolve_btfids build - Propagate EXTRA_LDFLAGS to resolve_btfids link command - Use pkg-config to detect zlib and zstd for resolve_btfids, similar libelf handling Also check for ASAN flag in selftests/bpf/Makefile for convenience. Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-7-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/resolve_btfids/Makefile | 7 +++++-- tools/testing/selftests/bpf/Makefile | 13 +++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index 1733a6e93a07..ef083602b73a 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -65,6 +65,9 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OU LIBELF_FLAGS := $(shell $(HOSTPKG_CONFIG) libelf --cflags 2>/dev/null) LIBELF_LIBS := $(shell $(HOSTPKG_CONFIG) libelf --libs 2>/dev/null || echo -lelf) +ZLIB_LIBS := $(shell $(HOSTPKG_CONFIG) zlib --libs 2>/dev/null || echo -lz) +ZSTD_LIBS := $(shell $(HOSTPKG_CONFIG) libzstd --libs 2>/dev/null || echo -lzstd) + HOSTCFLAGS_resolve_btfids += -g \ -I$(srctree)/tools/include \ -I$(srctree)/tools/include/uapi \ @@ -73,7 +76,7 @@ HOSTCFLAGS_resolve_btfids += -g \ $(LIBELF_FLAGS) \ -Wall -Werror -LIBS = $(LIBELF_LIBS) -lz +LIBS = $(LIBELF_LIBS) $(ZLIB_LIBS) $(ZSTD_LIBS) export srctree OUTPUT HOSTCFLAGS_resolve_btfids Q HOSTCC HOSTLD HOSTAR include $(srctree)/tools/build/Makefile.include @@ -83,7 +86,7 @@ $(BINARY_IN): fixdep FORCE prepare | $(OUTPUT) $(BINARY): $(BPFOBJ) $(SUBCMDOBJ) $(BINARY_IN) $(call msg,LINK,$@) - $(Q)$(HOSTCC) $(BINARY_IN) $(KBUILD_HOSTLDFLAGS) -o $@ $(BPFOBJ) $(SUBCMDOBJ) $(LIBS) + $(Q)$(HOSTCC) $(BINARY_IN) $(KBUILD_HOSTLDFLAGS) $(EXTRA_LDFLAGS) -o $@ $(BPFOBJ) $(SUBCMDOBJ) $(LIBS) clean_objects := $(wildcard $(OUTPUT)/*.o \ $(OUTPUT)/.*.o.cmd \ diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6776158f1f3e..72a9ba41f95e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -27,7 +27,11 @@ ifneq ($(wildcard $(GENHDR)),) endif BPF_GCC ?= $(shell command -v bpf-gcc;) +ifdef ASAN +SAN_CFLAGS ?= -fsanitize=address -fno-omit-frame-pointer +else SAN_CFLAGS ?= +endif SAN_LDFLAGS ?= $(SAN_CFLAGS) RELEASE ?= OPT_FLAGS ?= $(if $(RELEASE),-O2,-O0) @@ -326,8 +330,8 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ ARCH= CROSS_COMPILE= CC="$(HOSTCC)" LD="$(HOSTLD)" \ - EXTRA_CFLAGS='-g $(OPT_FLAGS) $(EXTRA_CFLAGS)' \ - EXTRA_LDFLAGS='$(EXTRA_LDFLAGS)' \ + EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS) $(EXTRA_CFLAGS)' \ + EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)' \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/ \ LIBBPF_DESTDIR=$(HOST_SCRATCH_DIR)/ \ @@ -338,8 +342,8 @@ $(CROSS_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(BPFOBJ) | $(BUILD_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) \ - EXTRA_CFLAGS='-g $(OPT_FLAGS) $(EXTRA_CFLAGS)' \ - EXTRA_LDFLAGS='$(EXTRA_LDFLAGS)' \ + EXTRA_CFLAGS='-g $(OPT_FLAGS) $(SAN_CFLAGS) $(EXTRA_CFLAGS)' \ + EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)' \ OUTPUT=$(BUILD_DIR)/bpftool/ \ LIBBPF_OUTPUT=$(BUILD_DIR)/libbpf/ \ LIBBPF_DESTDIR=$(SCRATCH_DIR)/ \ @@ -404,6 +408,7 @@ $(RESOLVE_BTFIDS): $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/resolve_btfids \ $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/resolve_btfids \ CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" \ LIBBPF_INCLUDE=$(HOST_INCLUDE_DIR) \ + EXTRA_LDFLAGS='$(SAN_LDFLAGS) $(EXTRA_LDFLAGS)' \ OUTPUT=$(HOST_BUILD_DIR)/resolve_btfids/ BPFOBJ=$(HOST_BPFOBJ) # Get Clang's default includes on this system, as opposed to those seen by From c5c1e313493cd836863bb673bb2b8beaa915cad9 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:23 -0800 Subject: [PATCH 07/36] resolve_btfids: Fix memory leaks reported by ASAN Running resolve_btfids with ASAN reveals memory leaks in btf_id handling. - Change get_id() to use a local buffer - Make btf_id__add() strdup the name internally - Add btf_id__free_all() that frees all nodese of a tree - Call the cleanup function on exit for every tree Acked-by: Jiri Olsa Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-8-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/resolve_btfids/main.c | 81 ++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index ca7fcd03efb6..5208f650080f 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -226,7 +226,7 @@ static struct btf_id *btf_id__find(struct rb_root *root, const char *name) } static struct btf_id *__btf_id__add(struct rb_root *root, - char *name, + const char *name, enum btf_id_kind kind, bool unique) { @@ -250,7 +250,11 @@ static struct btf_id *__btf_id__add(struct rb_root *root, id = zalloc(sizeof(*id)); if (id) { pr_debug("adding symbol %s\n", name); - id->name = name; + id->name = strdup(name); + if (!id->name) { + free(id); + return NULL; + } id->kind = kind; rb_link_node(&id->rb_node, parent, p); rb_insert_color(&id->rb_node, root); @@ -258,17 +262,21 @@ static struct btf_id *__btf_id__add(struct rb_root *root, return id; } -static inline struct btf_id *btf_id__add(struct rb_root *root, char *name, enum btf_id_kind kind) +static inline struct btf_id *btf_id__add(struct rb_root *root, + const char *name, + enum btf_id_kind kind) { return __btf_id__add(root, name, kind, false); } -static inline struct btf_id *btf_id__add_unique(struct rb_root *root, char *name, enum btf_id_kind kind) +static inline struct btf_id *btf_id__add_unique(struct rb_root *root, + const char *name, + enum btf_id_kind kind) { return __btf_id__add(root, name, kind, true); } -static char *get_id(const char *prefix_end) +static int get_id(const char *prefix_end, char *buf, size_t buf_sz) { /* * __BTF_ID__func__vfs_truncate__0 @@ -277,28 +285,28 @@ static char *get_id(const char *prefix_end) */ int len = strlen(prefix_end); int pos = sizeof("__") - 1; - char *p, *id; + char *p; if (pos >= len) - return NULL; + return -1; - id = strdup(prefix_end + pos); - if (id) { - /* - * __BTF_ID__func__vfs_truncate__0 - * id = ^ - * - * cut the unique id part - */ - p = strrchr(id, '_'); - p--; - if (*p != '_') { - free(id); - return NULL; - } - *p = '\0'; - } - return id; + if (len - pos >= buf_sz) + return -1; + + strcpy(buf, prefix_end + pos); + /* + * __BTF_ID__func__vfs_truncate__0 + * buf = ^ + * + * cut the unique id part + */ + p = strrchr(buf, '_'); + p--; + if (*p != '_') + return -1; + *p = '\0'; + + return 0; } static struct btf_id *add_set(struct object *obj, char *name, enum btf_id_kind kind) @@ -335,10 +343,9 @@ static struct btf_id *add_set(struct object *obj, char *name, enum btf_id_kind k static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) { - char *id; + char id[KSYM_NAME_LEN]; - id = get_id(name + size); - if (!id) { + if (get_id(name + size, id, sizeof(id))) { pr_err("FAILED to parse symbol name: %s\n", name); return NULL; } @@ -346,6 +353,21 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) return btf_id__add(root, id, BTF_ID_KIND_SYM); } +static void btf_id__free_all(struct rb_root *root) +{ + struct rb_node *next; + struct btf_id *id; + + next = rb_first(root); + while (next) { + id = rb_entry(next, struct btf_id, rb_node); + next = rb_next(&id->rb_node); + rb_erase(&id->rb_node, root); + free(id->name); + free(id); + } +} + static void bswap_32_data(void *data, u32 nr_bytes) { u32 cnt, i; @@ -1547,6 +1569,11 @@ dump_btf: out: btf__free(obj.base_btf); btf__free(obj.btf); + btf_id__free_all(&obj.structs); + btf_id__free_all(&obj.unions); + btf_id__free_all(&obj.typedefs); + btf_id__free_all(&obj.funcs); + btf_id__free_all(&obj.sets); if (obj.efile.elf) { elf_end(obj.efile.elf); close(obj.efile.fd); From 45897ced3c1d3940fb5b68fe48c9e144143ae0ae Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:24 -0800 Subject: [PATCH 08/36] selftests/bpf: Add DENYLIST.asan Add a denylist file for tests that should be skipped when built with userspace ASAN: $ make ... SAN_CFLAGS="-fsanitize=address -fno-omit-frame-pointer" Skip the following tests: - *arena*: userspace ASAN does not understand BPF arena maps and gets confused particularly when map_extra is non-zero - non-zero map_extra leads to mmap with MAP_FIXED, and ASAN treats this as an unknown memory region - task_local_data: ASAN complains about "incorrect" aligned_alloc() usage, but it's intentional in the test - uprobe_multi_test: very slow with ASAN enabled Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-9-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.asan | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/testing/selftests/bpf/DENYLIST.asan diff --git a/tools/testing/selftests/bpf/DENYLIST.asan b/tools/testing/selftests/bpf/DENYLIST.asan new file mode 100644 index 000000000000..d7fe372a2293 --- /dev/null +++ b/tools/testing/selftests/bpf/DENYLIST.asan @@ -0,0 +1,3 @@ +*arena* +task_local_data +uprobe_multi_test From a1a771bd649212ef32cf9b0bcc63213a762d354a Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:25 -0800 Subject: [PATCH 09/36] selftests/bpf: Refactor bpf_get_ksyms() trace helper ASAN reported a memory leak in bpf_get_ksyms(): it allocates a struct ksyms internally and never frees it. Move struct ksyms to trace_helpers.h and return it from the bpf_get_ksyms(), giving ownership to the caller. Add filtered_syms and filtered_cnt fields to the ksyms to hold the filtered array of symbols, previously returned by bpf_get_ksyms(). Fixup the call sites: kprobe_multi_test and bench_trigger. Signed-off-by: Ihor Solodrai Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260223190736.649171-10-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/benchs/bench_trigger.c | 14 ++++++----- .../bpf/prog_tests/kprobe_multi_test.c | 12 ++++------ tools/testing/selftests/bpf/trace_helpers.c | 23 ++++++++++--------- tools/testing/selftests/bpf/trace_helpers.h | 11 +++++++-- 4 files changed, 34 insertions(+), 26 deletions(-) diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index aeec9edd3851..f74b313d6ae4 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -230,8 +230,8 @@ static void trigger_fentry_setup(void) static void attach_ksyms_all(struct bpf_program *empty, bool kretprobe) { LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); - char **syms = NULL; - size_t cnt = 0; + struct bpf_link *link = NULL; + struct ksyms *ksyms = NULL; /* Some recursive functions will be skipped in * bpf_get_ksyms -> skip_entry, as they can introduce sufficient @@ -241,16 +241,18 @@ static void attach_ksyms_all(struct bpf_program *empty, bool kretprobe) * So, don't run the kprobe-multi-all and kretprobe-multi-all on * a debug kernel. */ - if (bpf_get_ksyms(&syms, &cnt, true)) { + if (bpf_get_ksyms(&ksyms, true)) { fprintf(stderr, "failed to get ksyms\n"); exit(1); } - opts.syms = (const char **) syms; - opts.cnt = cnt; + opts.syms = (const char **)ksyms->filtered_syms; + opts.cnt = ksyms->filtered_cnt; opts.retprobe = kretprobe; /* attach empty to all the kernel functions except bpf_get_numa_node_id. */ - if (!bpf_program__attach_kprobe_multi_opts(empty, NULL, &opts)) { + link = bpf_program__attach_kprobe_multi_opts(empty, NULL, &opts); + free_kallsyms_local(ksyms); + if (!link) { fprintf(stderr, "failed to attach bpf_program__attach_kprobe_multi_opts to all\n"); exit(1); } diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 9caef222e528..f81dcd609ee9 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -456,25 +456,23 @@ static void test_kprobe_multi_bench_attach(bool kernel) { LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); struct kprobe_multi_empty *skel = NULL; - char **syms = NULL; - size_t cnt = 0; + struct ksyms *ksyms = NULL; - if (!ASSERT_OK(bpf_get_ksyms(&syms, &cnt, kernel), "bpf_get_ksyms")) + if (!ASSERT_OK(bpf_get_ksyms(&ksyms, kernel), "bpf_get_ksyms")) return; skel = kprobe_multi_empty__open_and_load(); if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) goto cleanup; - opts.syms = (const char **) syms; - opts.cnt = cnt; + opts.syms = (const char **)ksyms->filtered_syms; + opts.cnt = ksyms->filtered_cnt; do_bench_test(skel, &opts); cleanup: kprobe_multi_empty__destroy(skel); - if (syms) - free(syms); + free_kallsyms_local(ksyms); } static void test_kprobe_multi_bench_attach_addr(bool kernel) diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index eeaab7013ca2..0e63daf83ed5 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -24,12 +24,6 @@ #define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" #define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" -struct ksyms { - struct ksym *syms; - size_t sym_cap; - size_t sym_cnt; -}; - static struct ksyms *ksyms; static pthread_mutex_t ksyms_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -54,6 +48,8 @@ void free_kallsyms_local(struct ksyms *ksyms) if (!ksyms) return; + free(ksyms->filtered_syms); + if (!ksyms->syms) { free(ksyms); return; @@ -610,7 +606,7 @@ static int search_kallsyms_compare(const void *p1, const struct ksym *p2) return compare_name(p1, p2->name); } -int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel) +int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel) { size_t cap = 0, cnt = 0; char *name = NULL, *ksym_name, **syms = NULL; @@ -637,8 +633,10 @@ int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel) else f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r"); - if (!f) + if (!f) { + free_kallsyms_local(ksyms); return -EINVAL; + } map = hashmap__new(symbol_hash, symbol_equal, NULL); if (IS_ERR(map)) { @@ -679,15 +677,18 @@ int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel) syms[cnt++] = ksym_name; } - *symsp = syms; - *cntp = cnt; + ksyms->filtered_syms = syms; + ksyms->filtered_cnt = cnt; + *ksymsp = ksyms; error: free(name); fclose(f); hashmap__free(map); - if (err) + if (err) { free(syms); + free_kallsyms_local(ksyms); + } return err; } diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index a5576b2dfc26..d5bf1433675d 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -23,7 +23,14 @@ struct ksym { long addr; char *name; }; -struct ksyms; + +struct ksyms { + struct ksym *syms; + size_t sym_cap; + size_t sym_cnt; + char **filtered_syms; + size_t filtered_cnt; +}; typedef int (*ksym_cmp_t)(const void *p1, const void *p2); typedef int (*ksym_search_cmp_t)(const void *p1, const struct ksym *p2); @@ -53,7 +60,7 @@ ssize_t get_rel_offset(uintptr_t addr); int read_build_id(const char *path, char *build_id, size_t size); -int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel); +int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel); int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel); #endif From 9d0272c91fbc3ae86f2c3b6ba0a0b0ee77d862e3 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:26 -0800 Subject: [PATCH 10/36] selftests/bpf: Fix memory leaks in tests Fix trivial memory leaks detected by userspace ASAN: - htab_update: free value buffer in test_reenter_update cleanup - test_xsk: inline pkt_stream_replace() in testapp_stats_rx_full() and testapp_stats_fill_empty() - testing_helpers: free buffer allocated by getline() in parse_test_list_file Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-11-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/htab_update.c | 1 + .../selftests/bpf/prog_tests/test_xsk.c | 24 +++++++++++++++---- tools/testing/selftests/bpf/testing_helpers.c | 1 + 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c index d0b405eb2966..ea1a6766fbe9 100644 --- a/tools/testing/selftests/bpf/prog_tests/htab_update.c +++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c @@ -61,6 +61,7 @@ static void test_reenter_update(void) ASSERT_EQ(skel->bss->update_err, -EDEADLK, "no reentrancy"); out: + free(value); htab_update__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c index bab4a31621c7..7e38ec6e656b 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_xsk.c +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c @@ -2003,9 +2003,17 @@ int testapp_stats_tx_invalid_descs(struct test_spec *test) int testapp_stats_rx_full(struct test_spec *test) { - if (pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE)) + struct pkt_stream *tmp; + + tmp = pkt_stream_generate(DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE); + if (!tmp) return TEST_FAILURE; - test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + test->ifobj_tx->xsk->pkt_stream = tmp; + + tmp = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + if (!tmp) + return TEST_FAILURE; + test->ifobj_rx->xsk->pkt_stream = tmp; test->ifobj_rx->xsk->rxqsize = DEFAULT_UMEM_BUFFERS; test->ifobj_rx->release_rx = false; @@ -2015,9 +2023,17 @@ int testapp_stats_rx_full(struct test_spec *test) int testapp_stats_fill_empty(struct test_spec *test) { - if (pkt_stream_replace(test, DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE)) + struct pkt_stream *tmp; + + tmp = pkt_stream_generate(DEFAULT_UMEM_BUFFERS + DEFAULT_UMEM_BUFFERS / 2, MIN_PKT_SIZE); + if (!tmp) return TEST_FAILURE; - test->ifobj_rx->xsk->pkt_stream = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + test->ifobj_tx->xsk->pkt_stream = tmp; + + tmp = pkt_stream_generate(DEFAULT_UMEM_BUFFERS, MIN_PKT_SIZE); + if (!tmp) + return TEST_FAILURE; + test->ifobj_rx->xsk->pkt_stream = tmp; test->ifobj_rx->use_fill_ring = false; test->ifobj_rx->validation_func = validate_fill_empty; diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 16eb37e5bad6..66af0d13751a 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -212,6 +212,7 @@ int parse_test_list_file(const char *path, break; } + free(buf); fclose(f); return err; } From 3eb4a2e399c6766ea0c8291e2adb5e6dc42b6e68 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:27 -0800 Subject: [PATCH 11/36] selftests/bpf: Fix cleanup in check_fd_array_cnt__fd_array_too_big() The Close() macro uses the passed in expression three times, which leads to repeated execution in case it has side effects. That is, Close(i--) would decrement i three times. ASAN caught a stack-buffer-undeflow error at a point where this was overlooked. Fix it. Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-12-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/fd_array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/fd_array.c b/tools/testing/selftests/bpf/prog_tests/fd_array.c index c534b4d5f9da..3078d8264deb 100644 --- a/tools/testing/selftests/bpf/prog_tests/fd_array.c +++ b/tools/testing/selftests/bpf/prog_tests/fd_array.c @@ -412,8 +412,8 @@ static void check_fd_array_cnt__fd_array_too_big(void) ASSERT_EQ(prog_fd, -E2BIG, "prog should have been rejected with -E2BIG"); cleanup_fds: - while (i > 0) - Close(extra_fds[--i]); + while (i-- > 0) + Close(extra_fds[i]); } void test_fd_array_cnt(void) From ff9511410d5fbdec21a3bbfaa477aa1f56735b33 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:28 -0800 Subject: [PATCH 12/36] veristat: Fix a memory leak for preset ENUMERATOR ASAN detected a memory leak in veristat. The cleanup code handling ENUMERATOR value missed freeing strdup-ed svalue. Fix it. Acked-by: Mykyta Yatsenko Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-13-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 1be1e353d40a..75f85e0362f5 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -3378,6 +3378,8 @@ int main(int argc, char **argv) } } free(env.presets[i].atoms); + if (env.presets[i].value.type == ENUMERATOR) + free(env.presets[i].value.svalue); } free(env.presets); return -err; From f7ac552be7f13e834a942f64b1ac87e7755b32f4 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:29 -0800 Subject: [PATCH 13/36] selftests/bpf: Fix use-after-free in xdp_metadata test ASAN reported a use-after-free in close_xsk(). The xsk->socket internally references xsk->umem via socket->ctx->umem, so the socket must be deleted before the umem. Fix the order of operations in close_xsk(). Acked-by: Mykyta Yatsenko Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-14-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/xdp_metadata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c index 19f92affc2da..5c31054ad4a4 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_metadata.c @@ -126,10 +126,10 @@ static int open_xsk(int ifindex, struct xsk *xsk) static void close_xsk(struct xsk *xsk) { - if (xsk->umem) - xsk_umem__delete(xsk->umem); if (xsk->socket) xsk_socket__delete(xsk->socket); + if (xsk->umem) + xsk_umem__delete(xsk->umem); munmap(xsk->umem_area, UMEM_SIZE); } From 68b8bea1eec392aa50736a859b8f1418067a3bc4 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:30 -0800 Subject: [PATCH 14/36] selftests/bpf: Fix double thread join in uprobe_multi_test ASAN reported a "joining already joined thread" error. The release_child() may be called multiple times for the same struct child. Fix by resetting child->thread to 0 after pthread_join. Also memset(0) static child variable in test_attach_api(). Acked-by: Mykyta Yatsenko Acked-by: Jiri Olsa Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-15-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 2ee17ef1dae2..56cbea280fbd 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -62,8 +62,10 @@ static void release_child(struct child *child) return; close(child->go[1]); close(child->go[0]); - if (child->thread) + if (child->thread) { pthread_join(child->thread, NULL); + child->thread = 0; + } close(child->c2p[0]); close(child->c2p[1]); if (child->pid > 0) @@ -331,6 +333,8 @@ test_attach_api(const char *binary, const char *pattern, struct bpf_uprobe_multi { static struct child child; + memset(&child, 0, sizeof(child)); + /* no pid filter */ __test_attach_api(binary, pattern, opts, NULL); From 71dca2950fe9abf8e6fd3d9f5e6342da6634b898 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:31 -0800 Subject: [PATCH 15/36] selftests/bpf: Fix resource leaks caused by missing cleanups ASAN reported a number of resource leaks: - Add missing *__destroy(skel) calls - Replace bpf_link__detach() with bpf_link__destroy() where appropriate - cgrp_local_storage: Add bpf_link__destroy() when bpf_iter_create fails - dynptr: Add missing bpf_object__close() Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-16-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/cgrp_local_storage.c | 4 ++- .../testing/selftests/bpf/prog_tests/dynptr.c | 5 +++- .../selftests/bpf/prog_tests/sockmap_basic.c | 28 +++++++++---------- .../selftests/bpf/prog_tests/sockmap_listen.c | 2 +- .../bpf/prog_tests/struct_ops_private_stack.c | 4 +-- .../selftests/bpf/prog_tests/tc_opts.c | 6 ++-- .../selftests/bpf/prog_tests/test_tc_tunnel.c | 5 +++- 7 files changed, 29 insertions(+), 25 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c index 9015e2c2ab12..478a77cb67e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c @@ -202,7 +202,7 @@ static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id) iter_fd = bpf_iter_create(bpf_link__fd(link)); if (!ASSERT_GE(iter_fd, 0, "iter_create")) - goto out; + goto out_link; /* trigger the program run */ (void)read(iter_fd, buf, sizeof(buf)); @@ -210,6 +210,8 @@ static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id) ASSERT_EQ(skel->bss->cgroup_id, cgroup_id, "cgroup_id"); close(iter_fd); +out_link: + bpf_link__destroy(link); out: cgrp_ls_sleepable__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index b9f86cb91e81..5fda11590708 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -137,11 +137,14 @@ static void verify_success(const char *prog_name, enum test_setup_type setup_typ ); link = bpf_program__attach(prog); - if (!ASSERT_OK_PTR(link, "bpf_program__attach")) + if (!ASSERT_OK_PTR(link, "bpf_program__attach")) { + bpf_object__close(obj); goto cleanup; + } err = bpf_prog_test_run_opts(aux_prog_fd, &topts); bpf_link__destroy(link); + bpf_object__close(obj); if (!ASSERT_OK(err, "test_run")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 256707e7d20d..dd3c757859f6 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -204,7 +204,7 @@ static void test_skmsg_helpers_with_link(enum bpf_map_type map_type) /* Fail since bpf_link for the same prog type has been created. */ link2 = bpf_program__attach_sockmap(prog_clone, map); if (!ASSERT_ERR_PTR(link2, "bpf_program__attach_sockmap")) { - bpf_link__detach(link2); + bpf_link__destroy(link2); goto out; } @@ -230,7 +230,7 @@ static void test_skmsg_helpers_with_link(enum bpf_map_type map_type) if (!ASSERT_OK(err, "bpf_link_update")) goto out; out: - bpf_link__detach(link); + bpf_link__destroy(link); test_skmsg_load_helpers__destroy(skel); } @@ -417,7 +417,7 @@ static void test_sockmap_skb_verdict_attach_with_link(void) if (!ASSERT_OK_PTR(link, "bpf_program__attach_sockmap")) goto out; - bpf_link__detach(link); + bpf_link__destroy(link); err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); if (!ASSERT_OK(err, "bpf_prog_attach")) @@ -426,7 +426,7 @@ static void test_sockmap_skb_verdict_attach_with_link(void) /* Fail since attaching with the same prog/map has been done. */ link = bpf_program__attach_sockmap(prog, map); if (!ASSERT_ERR_PTR(link, "bpf_program__attach_sockmap")) - bpf_link__detach(link); + bpf_link__destroy(link); err = bpf_prog_detach2(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT); if (!ASSERT_OK(err, "bpf_prog_detach2")) @@ -747,13 +747,13 @@ static void test_sockmap_skb_verdict_peek_with_link(void) test_sockmap_skb_verdict_peek_helper(map); ASSERT_EQ(pass->bss->clone_called, 1, "clone_called"); out: - bpf_link__detach(link); + bpf_link__destroy(link); test_sockmap_pass_prog__destroy(pass); } static void test_sockmap_unconnected_unix(void) { - int err, map, stream = 0, dgram = 0, zero = 0; + int err, map, stream = -1, dgram = -1, zero = 0; struct test_sockmap_pass_prog *skel; skel = test_sockmap_pass_prog__open_and_load(); @@ -764,22 +764,22 @@ static void test_sockmap_unconnected_unix(void) stream = xsocket(AF_UNIX, SOCK_STREAM, 0); if (stream < 0) - return; + goto out; dgram = xsocket(AF_UNIX, SOCK_DGRAM, 0); - if (dgram < 0) { - close(stream); - return; - } + if (dgram < 0) + goto out; err = bpf_map_update_elem(map, &zero, &stream, BPF_ANY); - ASSERT_ERR(err, "bpf_map_update_elem(stream)"); + if (!ASSERT_ERR(err, "bpf_map_update_elem(stream)")) + goto out; err = bpf_map_update_elem(map, &zero, &dgram, BPF_ANY); ASSERT_OK(err, "bpf_map_update_elem(dgram)"); - +out: close(stream); close(dgram); + test_sockmap_pass_prog__destroy(skel); } static void test_sockmap_many_socket(void) @@ -1027,7 +1027,7 @@ static void test_sockmap_skb_verdict_vsock_poll(void) if (xrecv_nonblock(conn, &buf, 1, 0) != 1) FAIL("xrecv_nonblock"); detach: - bpf_link__detach(link); + bpf_link__destroy(link); close: xclose(conn); xclose(peer); diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index f1bdccc7e4e7..cc0c68bab907 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -899,7 +899,7 @@ static void test_msg_redir_to_listening_with_link(struct test_sockmap_listen *sk redir_to_listening(family, sotype, sock_map, verdict_map, REDIR_EGRESS); - bpf_link__detach(link); + bpf_link__destroy(link); } static void redir_partial(int family, int sotype, int sock_map, int parser_map) diff --git a/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c b/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c index 4006879ca3fe..d42123a0fb16 100644 --- a/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c +++ b/tools/testing/selftests/bpf/prog_tests/struct_ops_private_stack.c @@ -54,9 +54,7 @@ static void test_private_stack_fail(void) } err = struct_ops_private_stack_fail__load(skel); - if (!ASSERT_ERR(err, "struct_ops_private_stack_fail__load")) - goto cleanup; - return; + ASSERT_ERR(err, "struct_ops_private_stack_fail__load"); cleanup: struct_ops_private_stack_fail__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/tc_opts.c b/tools/testing/selftests/bpf/prog_tests/tc_opts.c index dd7a138d8c3d..2955750ddada 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_opts.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_opts.c @@ -1360,10 +1360,8 @@ static void test_tc_opts_dev_cleanup_target(int target) assert_mprog_count_ifindex(ifindex, target, 4); - ASSERT_OK(system("ip link del dev tcx_opts1"), "del veth"); - ASSERT_EQ(if_nametoindex("tcx_opts1"), 0, "dev1_removed"); - ASSERT_EQ(if_nametoindex("tcx_opts2"), 0, "dev2_removed"); - return; + goto cleanup; + cleanup3: err = bpf_prog_detach_opts(fd3, loopback, target, &optd); ASSERT_OK(err, "prog_detach"); diff --git a/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c index 0fe0a8f62486..7fc4d7dd70ef 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tc_tunnel.c @@ -699,7 +699,7 @@ void test_tc_tunnel(void) return; if (!ASSERT_OK(setup(), "global setup")) - return; + goto out; for (i = 0; i < ARRAY_SIZE(subtests_cfg); i++) { cfg = &subtests_cfg[i]; @@ -711,4 +711,7 @@ void test_tc_tunnel(void) subtest_cleanup(cfg); } cleanup(); + +out: + test_tc_tunnel__destroy(skel); } From 2bb270a0ac68990648c5e84abf73bb7493230ea6 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:32 -0800 Subject: [PATCH 16/36] selftests/bpf: Free bpf_object in test_sysctl ASAN reported a resource leak due to the bpf_object not being tracked in test_sysctl. Add obj field to struct sysctl_test to properly clean it up. Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-17-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/test_sysctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/test_sysctl.c b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c index 273dd41ca09e..2a1ff821bc97 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_sysctl.c +++ b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c @@ -27,6 +27,7 @@ struct sysctl_test { OP_EPERM, SUCCESS, } result; + struct bpf_object *obj; }; static struct sysctl_test tests[] = { @@ -1471,6 +1472,7 @@ static int load_sysctl_prog_file(struct sysctl_test *test) return -1; } + test->obj = obj; return prog_fd; } @@ -1573,6 +1575,7 @@ out: /* Detaching w/o checking return code: best effort attempt. */ if (progfd != -1) bpf_prog_detach(cgfd, atype); + bpf_object__close(test->obj); close(progfd); printf("[%s]\n", err ? "FAIL" : "PASS"); return err; From 3e711c8e4707c46cc45d9775b50204d0f2790d77 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:07:33 -0800 Subject: [PATCH 17/36] selftests/bpf: Fix array bounds warning in jit_disasm_helpers Compiler cannot infer upper bound for labels.cnt and warns about potential buffer overflow in snprintf. Add an explicit bounds check (... && i < MAX_LOCAL_LABELS) in the loop condition to fix the warning. Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223190736.649171-18-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/jit_disasm_helpers.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c index febd6b12e372..364c557c5115 100644 --- a/tools/testing/selftests/bpf/jit_disasm_helpers.c +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c @@ -122,15 +122,15 @@ static int disasm_one_func(FILE *text_out, uint8_t *image, __u32 len) pc += cnt; } qsort(labels.pcs, labels.cnt, sizeof(*labels.pcs), cmp_u32); - for (i = 0; i < labels.cnt; ++i) - /* gcc is unable to infer upper bound for labels.cnt and assumes - * it to be U32_MAX. U32_MAX takes 10 decimal digits. - * snprintf below prints into labels.names[*], - * which has space only for two digits and a letter. - * To avoid truncation warning use (i % MAX_LOCAL_LABELS), - * which informs gcc about printed value upper bound. - */ - snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i % MAX_LOCAL_LABELS); + /* gcc is unable to infer upper bound for labels.cnt and + * assumes it to be U32_MAX. U32_MAX takes 10 decimal digits. + * snprintf below prints into labels.names[*], which has space + * only for two digits and a letter. To avoid truncation + * warning use (i < MAX_LOCAL_LABELS), which informs gcc about + * printed value upper bound. + */ + for (i = 0; i < labels.cnt && i < MAX_LOCAL_LABELS; ++i) + snprintf(labels.names[i], sizeof(labels.names[i]), "L%d", i); /* now print with labels */ labels.print_phase = true; From ad90ecedad755e2f3e31364ec3130e5bb2f4b64a Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:11:16 -0800 Subject: [PATCH 18/36] selftests/bpf: Fix out-of-bounds array access bugs reported by ASAN - kmem_cache_iter: remove unnecessary debug output - lwt_seg6local: change the type of foobar to char[] - the sizeof(foobar) returned the pointer size and not a string length as intended - verifier_log: increase prog_name buffer size in verif_log_subtest() - compiler has a conservative estimate of fixed_log_sz value, making ASAN complain on snprint() call Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223191118.655185-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c | 7 ++----- tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c | 2 +- tools/testing/selftests/bpf/prog_tests/verifier_log.c | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c index 6e35e13c2022..399fe9103f83 100644 --- a/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/kmem_cache_iter.c @@ -104,11 +104,8 @@ void test_kmem_cache_iter(void) if (!ASSERT_GE(iter_fd, 0, "iter_create")) goto destroy; - memset(buf, 0, sizeof(buf)); - while (read(iter_fd, buf, sizeof(buf)) > 0) { - /* Read out all contents */ - printf("%s", buf); - } + while (read(iter_fd, buf, sizeof(buf)) > 0) + ; /* Read out all contents */ /* Next reads should return 0 */ ASSERT_EQ(read(iter_fd, buf, sizeof(buf)), 0, "read"); diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c b/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c index 3bc730b7c7fa..1b25d5c5f8fb 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c @@ -117,7 +117,7 @@ void test_lwt_seg6local(void) const char *ns1 = NETNS_BASE "1"; const char *ns6 = NETNS_BASE "6"; struct nstoken *nstoken = NULL; - const char *foobar = "foobar"; + const char foobar[] = "foobar"; ssize_t bytes; int sfd, cfd; char buf[7]; diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_log.c b/tools/testing/selftests/bpf/prog_tests/verifier_log.c index 8337c6bc5b95..aaa2854974c0 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier_log.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier_log.c @@ -47,7 +47,7 @@ static int load_prog(struct bpf_prog_load_opts *opts, bool expect_load_error) static void verif_log_subtest(const char *name, bool expect_load_error, int log_level) { LIBBPF_OPTS(bpf_prog_load_opts, opts); - char *exp_log, prog_name[16], op_name[32]; + char *exp_log, prog_name[24], op_name[32]; struct test_log_buf *skel; struct bpf_program *prog; size_t fixed_log_sz; From a2714e730304c79bf85d2178387255ef8348b897 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:11:17 -0800 Subject: [PATCH 19/36] selftests/bpf: Check BPFTOOL env var in detect_bpftool_path() The bpftool_maps_access and bpftool_metadata tests may fail on BPF CI with "command not found", depending on a workflow. This happens because detect_bpftool_path() only checks two hardcoded relative paths: - ./tools/sbin/bpftool - ../tools/sbin/bpftool Add support for a BPFTOOL environment variable that allows specifying the exact path to the bpftool binary. Acked-by: Mykyta Yatsenko Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223191118.655185-2-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpftool_helpers.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/bpftool_helpers.c b/tools/testing/selftests/bpf/bpftool_helpers.c index 595a636fa13b..929fc257f431 100644 --- a/tools/testing/selftests/bpf/bpftool_helpers.c +++ b/tools/testing/selftests/bpf/bpftool_helpers.c @@ -14,6 +14,17 @@ static int detect_bpftool_path(char *buffer, size_t size) { char tmp[BPFTOOL_PATH_MAX_LEN]; + const char *env_path; + + /* First, check if BPFTOOL environment variable is set */ + env_path = getenv("BPFTOOL"); + if (env_path && access(env_path, X_OK) == 0) { + strscpy(buffer, env_path, size); + return 0; + } else if (env_path) { + fprintf(stderr, "bpftool '%s' doesn't exist or is not executable\n", env_path); + return 1; + } /* Check default bpftool location (will work if we are running the * default flavor of test_progs) @@ -33,7 +44,7 @@ static int detect_bpftool_path(char *buffer, size_t size) return 0; } - /* Failed to find bpftool binary */ + fprintf(stderr, "Failed to detect bpftool path, use BPFTOOL env var to override\n"); return 1; } From 4c9d07865c06bb9b9faff1ecf6c4b7cc8f7d67a9 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 23 Feb 2026 11:11:18 -0800 Subject: [PATCH 20/36] selftests/bpf: Don't override SIGSEGV handler with ASAN test_progs has custom SIGSEGV handler, which interferes with the address sanitizer [1]. Add an #ifndef to avoid this. Additionally, declare an __asan_on_error() to dump the test logs in the same way it happens in the custom SIGSEGV handler. [1] https://lore.kernel.org/bpf/73d832948b01dbc0ebc60d85574bdf8537f3a810.camel@gmail.com/ Acked-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260223191118.655185-3-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_progs.c | 36 +++++++++++++++++------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index d1418ec1f351..0929f4a7bda4 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1261,14 +1261,8 @@ int get_bpf_max_tramp_links(void) return ret; } -#define MAX_BACKTRACE_SZ 128 -void crash_handler(int signum) +static void dump_crash_log(void) { - void *bt[MAX_BACKTRACE_SZ]; - size_t sz; - - sz = backtrace(bt, ARRAY_SIZE(bt)); - fflush(stdout); stdout = env.stdout_saved; stderr = env.stderr_saved; @@ -1277,12 +1271,32 @@ void crash_handler(int signum) env.test_state->error_cnt++; dump_test_log(env.test, env.test_state, true, false, NULL); } +} + +#define MAX_BACKTRACE_SZ 128 + +void crash_handler(int signum) +{ + void *bt[MAX_BACKTRACE_SZ]; + size_t sz; + + sz = backtrace(bt, ARRAY_SIZE(bt)); + + dump_crash_log(); + if (env.worker_id != -1) fprintf(stderr, "[%d]: ", env.worker_id); fprintf(stderr, "Caught signal #%d!\nStack trace:\n", signum); backtrace_symbols_fd(bt, sz, STDERR_FILENO); } +#ifdef __SANITIZE_ADDRESS__ +void __asan_on_error(void) +{ + dump_crash_log(); +} +#endif + void hexdump(const char *prefix, const void *buf, size_t len) { for (int i = 0; i < len; i++) { @@ -1944,13 +1958,15 @@ int main(int argc, char **argv) .parser = parse_arg, .doc = argp_program_doc, }; + int err, i; + +#ifndef __SANITIZE_ADDRESS__ struct sigaction sigact = { .sa_handler = crash_handler, .sa_flags = SA_RESETHAND, - }; - int err, i; - + }; sigaction(SIGSEGV, &sigact, NULL); +#endif env.stdout_saved = stdout; env.stderr_saved = stderr; From ef06fd16d48704eac868441d98d4ef083d8f3d07 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 26 Feb 2026 07:55:25 +0000 Subject: [PATCH 21/36] bpf, arm64: Force 8-byte alignment for JIT buffer to prevent atomic tearing struct bpf_plt contains a u64 target field. Currently, the BPF JIT allocator requests an alignment of 4 bytes (sizeof(u32)) for the JIT buffer. Because the base address of the JIT buffer can be 4-byte aligned (e.g., ending in 0x4 or 0xc), the relative padding logic in build_plt() fails to ensure that target lands on an 8-byte boundary. This leads to two issues: 1. UBSAN reports misaligned-access warnings when dereferencing the structure. 2. More critically, target is updated concurrently via WRITE_ONCE() in bpf_arch_text_poke() while the JIT'd code executes ldr. On arm64, 64-bit loads/stores are only guaranteed to be single-copy atomic if they are 64-bit aligned. A misaligned target risks a torn read, causing the JIT to jump to a corrupted address. Fix this by increasing the allocation alignment requirement to 8 bytes (sizeof(u64)) in bpf_jit_binary_pack_alloc(). This anchors the base of the JIT buffer to an 8-byte boundary, allowing the relative padding math in build_plt() to correctly align the target field. Fixes: b2ad54e1533e ("bpf, arm64: Implement bpf_arch_text_poke() for arm64") Signed-off-by: Fuad Tabba Acked-by: Will Deacon Link: https://lore.kernel.org/r/20260226075525.233321-1-tabba@google.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 356d33c7a4ae..adf84962d579 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2119,7 +2119,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) extable_offset = round_up(prog_size + PLT_TARGET_SIZE, extable_align); image_size = extable_offset + extable_size; ro_header = bpf_jit_binary_pack_alloc(image_size, &ro_image_ptr, - sizeof(u32), &header, &image_ptr, + sizeof(u64), &header, &image_ptr, jit_fill_hole); if (!ro_header) { prog = orig_prog; From ad6fface76da42721c15e8fb281570aaa44a2c01 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 25 Feb 2026 12:12:49 +0100 Subject: [PATCH 22/36] bpf: Fix kprobe_multi cookies access in show_fdinfo callback We don't check if cookies are available on the kprobe_multi link before accessing them in show_fdinfo callback, we should. Cc: stable@vger.kernel.org Fixes: da7e9c0a7fbc ("bpf: Add show_fdinfo for kprobe_multi") Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260225111249.186230-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9bc0dfd235af..0b040a417442 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2454,8 +2454,10 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_kprobe_multi_link *kmulti_link; + bool has_cookies; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + has_cookies = !!kmulti_link->cookies; seq_printf(seq, "kprobe_cnt:\t%u\n" @@ -2467,7 +2469,7 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, for (int i = 0; i < kmulti_link->cnt; i++) { seq_printf(seq, "%llu\t %pS\n", - kmulti_link->cookies[i], + has_cookies ? kmulti_link->cookies[i] : 0, (void *)kmulti_link->addrs[i]); } } From b7bf516c3ecd9a2aae2dc2635178ab87b734fef1 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Wed, 25 Feb 2026 05:34:44 +0000 Subject: [PATCH 23/36] bpf: Fix stack-out-of-bounds write in devmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_upper_ifindexes() iterates over all upper devices and writes their indices into an array without checking bounds. Also the callers assume that the max number of upper devices is MAX_NEST_DEV and allocate excluded_devices[1+MAX_NEST_DEV] on the stack, but that assumption is not correct and the number of upper devices could be larger than MAX_NEST_DEV (e.g., many macvlans), causing a stack-out-of-bounds write. Add a max parameter to get_upper_ifindexes() to avoid the issue. When there are too many upper devices, return -EOVERFLOW and abort the redirect. To reproduce, create more than MAX_NEST_DEV(8) macvlans on a device with an XDP program attached using BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS. Then send a packet to the device to trigger the XDP redirect path. Reported-by: syzbot+10cc7f13760b31bd2e61@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/698c4ce3.050a0220.340abe.000b.GAE@google.com/T/ Fixes: aeea1b86f936 ("bpf, devmap: Exclude XDP broadcast to master device") Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Kohei Enju Link: https://lore.kernel.org/r/20260225053506.4738-1-kohei@enjuk.jp Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2625601de76e..2984e938f94d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -588,18 +588,22 @@ static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifin } /* Get ifindex of each upper device. 'indexes' must be able to hold at - * least MAX_NEST_DEV elements. - * Returns the number of ifindexes added. + * least 'max' elements. + * Returns the number of ifindexes added, or -EOVERFLOW if there are too + * many upper devices. */ -static int get_upper_ifindexes(struct net_device *dev, int *indexes) +static int get_upper_ifindexes(struct net_device *dev, int *indexes, int max) { struct net_device *upper; struct list_head *iter; int n = 0; netdev_for_each_upper_dev_rcu(dev, upper, iter) { + if (n >= max) + return -EOVERFLOW; indexes[n++] = upper->ifindex; } + return n; } @@ -615,7 +619,11 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, int err; if (exclude_ingress) { - num_excluded = get_upper_ifindexes(dev_rx, excluded_devices); + num_excluded = get_upper_ifindexes(dev_rx, excluded_devices, + ARRAY_SIZE(excluded_devices) - 1); + if (num_excluded < 0) + return num_excluded; + excluded_devices[num_excluded++] = dev_rx->ifindex; } @@ -733,7 +741,11 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, int err; if (exclude_ingress) { - num_excluded = get_upper_ifindexes(dev, excluded_devices); + num_excluded = get_upper_ifindexes(dev, excluded_devices, + ARRAY_SIZE(excluded_devices) - 1); + if (num_excluded < 0) + return num_excluded; + excluded_devices[num_excluded++] = dev->ifindex; } From 60e3cbef3500ad6101bc91946e645f69b1bb9556 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 24 Feb 2026 16:33:51 -0800 Subject: [PATCH 24/36] selftests/bpf: Fix a memory leak in xdp_flowtable test test_progs run with ASAN reported [1]: ==126==ERROR: LeakSanitizer: detected memory leaks Direct leak of 32 byte(s) in 1 object(s) allocated from: #0 0x7f1ff3cfa340 in calloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:77 #1 0x5610c15bb520 in bpf_program_attach_fd /codebuild/output/src685977285/src/actions-runner/_work/vmtest/vmtest/src/tools/lib/bpf/libbpf.c:13164 #2 0x5610c15bb740 in bpf_program__attach_xdp /codebuild/output/src685977285/src/actions-runner/_work/vmtest/vmtest/src/tools/lib/bpf/libbpf.c:13204 #3 0x5610c14f91d3 in test_xdp_flowtable /codebuild/output/src685977285/src/actions-runner/_work/vmtest/vmtest/src/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c:138 #4 0x5610c1533566 in run_one_test /codebuild/output/src685977285/src/actions-runner/_work/vmtest/vmtest/src/tools/testing/selftests/bpf/test_progs.c:1406 #5 0x5610c1537fb0 in main /codebuild/output/src685977285/src/actions-runner/_work/vmtest/vmtest/src/tools/testing/selftests/bpf/test_progs.c:2097 #6 0x7f1ff25df1c9 (/lib/x86_64-linux-gnu/libc.so.6+0x2a1c9) (BuildId: 8e9fd827446c24067541ac5390e6f527fb5947bb) #7 0x7f1ff25df28a in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x2a28a) (BuildId: 8e9fd827446c24067541ac5390e6f527fb5947bb) #8 0x5610c0bd3180 in _start (/tmp/work/vmtest/vmtest/selftests/bpf/test_progs+0x593180) (BuildId: cdf9f103f42307dc0a2cd6cfc8afcbc1366cf8bd) Fix by properly destroying bpf_link on exit in xdp_flowtable test. [1] https://github.com/kernel-patches/vmtest/actions/runs/22361085418/job/64716490680 Signed-off-by: Ihor Solodrai Reviewed-by: Subbaraya Sundeep Link: https://lore.kernel.org/r/20260225003351.465104-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c b/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c index 3f9146d83d79..325e0b64dc35 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_flowtable.c @@ -67,7 +67,7 @@ void test_xdp_flowtable(void) struct nstoken *tok = NULL; int iifindex, stats_fd; __u32 value, key = 0; - struct bpf_link *link; + struct bpf_link *link = NULL; if (SYS_NOFAIL("nft -v")) { fprintf(stdout, "Missing required nft tool\n"); @@ -160,6 +160,7 @@ void test_xdp_flowtable(void) ASSERT_GE(value, N_PACKETS - 2, "bpf_xdp_flow_lookup failed"); out: + bpf_link__destroy(link); xdp_flowtable__destroy(skel); if (tok) close_netns(tok); From 6881af27f9ea0f5ca8f606f573ef5cc25ca31fe4 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Tue, 24 Feb 2026 16:33:48 -0800 Subject: [PATCH 25/36] selftests/bpf: Fix OOB read in dmabuf_collector Dmabuf name allocations can be less than DMA_BUF_NAME_LEN characters, but bpf_probe_read_kernel always tries to read exactly that many bytes. If a name is less than DMA_BUF_NAME_LEN characters, bpf_probe_read_kernel will read past the end. bpf_probe_read_kernel_str stops at the first NUL terminator so use it instead, like iter_dmabuf_for_each already does. Fixes: ae5d2c59ecd7 ("selftests/bpf: Add test for dmabuf_iter") Reported-by: Jerome Lee Signed-off-by: T.J. Mercier Link: https://lore.kernel.org/r/20260225003349.113746-1-tjmercier@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/dmabuf_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/dmabuf_iter.c b/tools/testing/selftests/bpf/progs/dmabuf_iter.c index 13cdb11fdeb2..9cbb7442646e 100644 --- a/tools/testing/selftests/bpf/progs/dmabuf_iter.c +++ b/tools/testing/selftests/bpf/progs/dmabuf_iter.c @@ -48,7 +48,7 @@ int dmabuf_collector(struct bpf_iter__dmabuf *ctx) /* Buffers are not required to be named */ if (pname) { - if (bpf_probe_read_kernel(name, sizeof(name), pname)) + if (bpf_probe_read_kernel_str(name, sizeof(name), pname) < 0) return 1; /* Name strings can be provided by userspace */ From 1df97a7453eec80c1912c2d0360290a3970a7671 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:01 -0800 Subject: [PATCH 26/36] bpf: Register dtor for freeing special fields There is a race window where BPF hash map elements can leak special fields if the program with access to the map value recreates these special fields between the check_and_free_fields done on the map value and its eventual return to the memory allocator. Several ways were explored prior to this patch, most notably [0] tried to use a poison value to reject attempts to recreate special fields for map values that have been logically deleted but still accessible to BPF programs (either while sitting in the free list or when reused). While this approach works well for task work, timers, wq, etc., it is harder to apply the idea to kptrs, which have a similar race and failure mode. Instead, we change bpf_mem_alloc to allow registering destructor for allocated elements, such that when they are returned to the allocator, any special fields created while they were accessible to programs in the mean time will be freed. If these values get reused, we do not free the fields again before handing the element back. The special fields thus may remain initialized while the map value sits in a free list. When bpf_mem_alloc is retired in the future, a similar concept can be introduced to kmalloc_nolock-backed kmem_cache, paired with the existing idea of a constructor. Note that the destructor registration happens in map_check_btf, after the BTF record is populated and (at that point) avaiable for inspection and duplication. Duplication is necessary since the freeing of embedded bpf_mem_alloc can be decoupled from actual map lifetime due to logic introduced to reduce the cost of rcu_barrier()s in mem alloc free path in 9f2c6e96c65e ("bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc."). As such, once all callbacks are done, we must also free the duplicated record. To remove dependency on the bpf_map itself, also stash the key size of the map to obtain value from htab_elem long after the map is gone. [0]: https://lore.kernel.org/bpf/20260216131341.1285427-1-mykyta.yatsenko5@gmail.com Fixes: 14a324f6a67e ("bpf: Wire up freeing of referenced kptr") Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Reported-by: Alexei Starovoitov Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 6 +++ kernel/bpf/hashtab.c | 87 +++++++++++++++++++++++++++++++++++ kernel/bpf/memalloc.c | 58 ++++++++++++++++++----- 3 files changed, 140 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index e45162ef59bb..4ce0d27f8ea2 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -14,6 +14,8 @@ struct bpf_mem_alloc { struct obj_cgroup *objcg; bool percpu; struct work_struct work; + void (*dtor_ctx_free)(void *ctx); + void *dtor_ctx; }; /* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects. @@ -32,6 +34,10 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg /* The percpu allocation with a specific unit size. */ int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); +void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, + void (*dtor)(void *obj, void *ctx), + void (*dtor_ctx_free)(void *ctx), + void *ctx); /* Check the allocation size for kmalloc equivalent allocator */ int bpf_mem_alloc_check_size(bool percpu, size_t size); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3b9d297a53be..582f0192b7e1 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -125,6 +125,11 @@ struct htab_elem { char key[] __aligned(8); }; +struct htab_btf_record { + struct btf_record *record; + u32 key_size; +}; + static inline bool htab_is_prealloc(const struct bpf_htab *htab) { return !(htab->map.map_flags & BPF_F_NO_PREALLOC); @@ -457,6 +462,84 @@ static int htab_map_alloc_check(union bpf_attr *attr) return 0; } +static void htab_mem_dtor(void *obj, void *ctx) +{ + struct htab_btf_record *hrec = ctx; + struct htab_elem *elem = obj; + void *map_value; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + map_value = htab_elem_value(elem, hrec->key_size); + bpf_obj_free_fields(hrec->record, map_value); +} + +static void htab_pcpu_mem_dtor(void *obj, void *ctx) +{ + void __percpu *pptr = *(void __percpu **)obj; + struct htab_btf_record *hrec = ctx; + int cpu; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + for_each_possible_cpu(cpu) + bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu)); +} + +static void htab_dtor_ctx_free(void *ctx) +{ + struct htab_btf_record *hrec = ctx; + + btf_record_free(hrec->record); + kfree(ctx); +} + +static int htab_set_dtor(const struct bpf_htab *htab, void (*dtor)(void *, void *)) +{ + u32 key_size = htab->map.key_size; + const struct bpf_mem_alloc *ma; + struct htab_btf_record *hrec; + int err; + + /* No need for dtors. */ + if (IS_ERR_OR_NULL(htab->map.record)) + return 0; + + hrec = kzalloc(sizeof(*hrec), GFP_KERNEL); + if (!hrec) + return -ENOMEM; + hrec->key_size = key_size; + hrec->record = btf_record_dup(htab->map.record); + if (IS_ERR(hrec->record)) { + err = PTR_ERR(hrec->record); + kfree(hrec); + return err; + } + ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma; + /* Kinda sad, but cast away const-ness since we change ma->dtor. */ + bpf_mem_alloc_set_dtor((struct bpf_mem_alloc *)ma, dtor, htab_dtor_ctx_free, hrec); + return 0; +} + +static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, const struct btf_type *value_type) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + if (htab_is_prealloc(htab)) + return 0; + /* + * We must set the dtor using this callback, as map's BTF record is not + * populated in htab_map_alloc(), so it will always appear as NULL. + */ + if (htab_is_percpu(htab)) + return htab_set_dtor(htab, htab_pcpu_mem_dtor); + else + return htab_set_dtor(htab, htab_mem_dtor); +} + static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || @@ -2281,6 +2364,7 @@ const struct bpf_map_ops htab_map_ops = { .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], @@ -2303,6 +2387,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru), .map_btf_id = &htab_map_btf_ids[0], @@ -2482,6 +2567,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_percpu), .map_btf_id = &htab_map_btf_ids[0], @@ -2502,6 +2588,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru_percpu), .map_btf_id = &htab_map_btf_ids[0], diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index bd45dda9dc35..682a9f34214b 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -102,6 +102,8 @@ struct bpf_mem_cache { int percpu_size; bool draining; struct bpf_mem_cache *tgt; + void (*dtor)(void *obj, void *ctx); + void *dtor_ctx; /* list of objects to be freed after RCU GP */ struct llist_head free_by_rcu; @@ -260,12 +262,14 @@ static void free_one(void *obj, bool percpu) kfree(obj); } -static int free_all(struct llist_node *llnode, bool percpu) +static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu) { struct llist_node *pos, *t; int cnt = 0; llist_for_each_safe(pos, t, llnode) { + if (c->dtor) + c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx); free_one(pos, percpu); cnt++; } @@ -276,7 +280,7 @@ static void __free_rcu(struct rcu_head *head) { struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); - free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); + free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); atomic_set(&c->call_rcu_ttrace_in_progress, 0); } @@ -308,7 +312,7 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c) if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) { if (unlikely(READ_ONCE(c->draining))) { llnode = llist_del_all(&c->free_by_rcu_ttrace); - free_all(llnode, !!c->percpu_size); + free_all(c, llnode, !!c->percpu_size); } return; } @@ -417,7 +421,7 @@ static void check_free_by_rcu(struct bpf_mem_cache *c) dec_active(c, &flags); if (unlikely(READ_ONCE(c->draining))) { - free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); + free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size); atomic_set(&c->call_rcu_in_progress, 0); } else { call_rcu_hurry(&c->rcu, __free_by_rcu); @@ -635,13 +639,13 @@ static void drain_mem_cache(struct bpf_mem_cache *c) * Except for waiting_for_gp_ttrace list, there are no concurrent operations * on these lists, so it is safe to use __llist_del_all(). */ - free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu); - free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); - free_all(__llist_del_all(&c->free_llist), percpu); - free_all(__llist_del_all(&c->free_llist_extra), percpu); - free_all(__llist_del_all(&c->free_by_rcu), percpu); - free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); - free_all(llist_del_all(&c->waiting_for_gp), percpu); + free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu); + free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu); + free_all(c, __llist_del_all(&c->free_llist), percpu); + free_all(c, __llist_del_all(&c->free_llist_extra), percpu); + free_all(c, __llist_del_all(&c->free_by_rcu), percpu); + free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu); + free_all(c, llist_del_all(&c->waiting_for_gp), percpu); } static void check_mem_cache(struct bpf_mem_cache *c) @@ -680,6 +684,9 @@ static void check_leaked_objs(struct bpf_mem_alloc *ma) static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) { + /* We can free dtor ctx only once all callbacks are done using it. */ + if (ma->dtor_ctx_free) + ma->dtor_ctx_free(ma->dtor_ctx); check_leaked_objs(ma); free_percpu(ma->cache); free_percpu(ma->caches); @@ -1014,3 +1021,32 @@ int bpf_mem_alloc_check_size(bool percpu, size_t size) return 0; } + +void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx), + void (*dtor_ctx_free)(void *ctx), void *ctx) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i; + + ma->dtor_ctx_free = dtor_ctx_free; + ma->dtor_ctx = ctx; + + if (ma->cache) { + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + c->dtor = dtor; + c->dtor_ctx = ctx; + } + } + if (ma->caches) { + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + c->dtor = dtor; + c->dtor_ctx = ctx; + } + } + } +} From ae51772b1e94ba1d76db19085957dbccac189c1c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:02 -0800 Subject: [PATCH 27/36] bpf: Lose const-ness of map in map_check_btf() BPF hash map may now use the map_check_btf() callback to decide whether to set a dtor on its bpf_mem_alloc or not. Unlike C++ where members can opt out of const-ness using mutable, we must lose the const qualifier on the callback such that we can avoid the ugly cast. Make the change and adjust all existing users, and lose the comment in hashtab.c. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- include/linux/bpf_local_storage.h | 2 +- kernel/bpf/arena.c | 2 +- kernel/bpf/arraymap.c | 2 +- kernel/bpf/bloom_filter.c | 2 +- kernel/bpf/bpf_insn_array.c | 2 +- kernel/bpf/bpf_local_storage.c | 2 +- kernel/bpf/hashtab.c | 9 ++++----- kernel/bpf/local_storage.c | 2 +- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/syscall.c | 2 +- 11 files changed, 15 insertions(+), 16 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b78b53198a2e..05b34a6355b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -124,7 +124,7 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, + int (*map_check_btf)(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); @@ -656,7 +656,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map) map->ops->map_seq_show_elem; } -int map_check_no_btf(const struct bpf_map *map, +int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 85efa9772530..8157e8da61d4 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -176,7 +176,7 @@ u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache); -int bpf_local_storage_map_check_btf(const struct bpf_map *map, +int bpf_local_storage_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 144f30e740e8..f355cf1c1a16 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -303,7 +303,7 @@ static long arena_map_update_elem(struct bpf_map *map, void *key, return -EOPNOTSUPP; } -static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { return 0; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 26763df6134a..33de68c95ad8 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -548,7 +548,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, +static int array_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 35e1ddca74d2..b73336c976b7 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -180,7 +180,7 @@ static long bloom_map_update_elem(struct bpf_map *map, void *key, return -EINVAL; } -static int bloom_map_check_btf(const struct bpf_map *map, +static int bloom_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c index c0286f25ca3c..a2f84afe6f7c 100644 --- a/kernel/bpf/bpf_insn_array.c +++ b/kernel/bpf/bpf_insn_array.c @@ -98,7 +98,7 @@ static long insn_array_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } -static int insn_array_check_btf(const struct bpf_map *map, +static int insn_array_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b28f07d3a0db..2bf5ca5ae0df 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -797,7 +797,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -int bpf_local_storage_map_check_btf(const struct bpf_map *map, +int bpf_local_storage_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 582f0192b7e1..bc6bc8bb871d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -496,10 +496,10 @@ static void htab_dtor_ctx_free(void *ctx) kfree(ctx); } -static int htab_set_dtor(const struct bpf_htab *htab, void (*dtor)(void *, void *)) +static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *)) { u32 key_size = htab->map.key_size; - const struct bpf_mem_alloc *ma; + struct bpf_mem_alloc *ma; struct htab_btf_record *hrec; int err; @@ -518,12 +518,11 @@ static int htab_set_dtor(const struct bpf_htab *htab, void (*dtor)(void *, void return err; } ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma; - /* Kinda sad, but cast away const-ness since we change ma->dtor. */ - bpf_mem_alloc_set_dtor((struct bpf_mem_alloc *)ma, dtor, htab_dtor_ctx_free, hrec); + bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec); return 0; } -static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 1ccbf28b2ad9..8fca0c64f7b1 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -364,7 +364,7 @@ static long cgroup_storage_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } -static int cgroup_storage_check_btf(const struct bpf_map *map, +static int cgroup_storage_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1adeb4d3b8cf..0f57608b385d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -751,7 +751,7 @@ free_stack: return err; } -static int trie_check_btf(const struct bpf_map *map, +static int trie_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0378e83b4099..274039e36465 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1234,7 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) } EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); -int map_check_no_btf(const struct bpf_map *map, +int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) From f41deee082dc7b1c198de21710cb5cb9c604cae0 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:03 -0800 Subject: [PATCH 28/36] bpf: Delay freeing fields in local storage Currently, when use_kmalloc_nolock is false, the freeing of fields for a local storage selem is done eagerly before waiting for the RCU or RCU tasks trace grace period to elapse. This opens up a window where the program which has access to the selem can recreate the fields after the freeing of fields is done eagerly, causing memory leaks when the element is finally freed and returned to the kernel. Make a few changes to address this. First, delay the freeing of fields until after the grace periods have expired using a __bpf_selem_free_rcu wrapper which is eventually invoked after transitioning through the necessary number of grace period waits. Replace usage of the kfree_rcu with call_rcu to be able to take a custom callback. Finally, care needs to be taken to extend the rcu barriers for all cases, and not just when use_kmalloc_nolock is true, as RCU and RCU tasks trace callbacks can be in flight for either case and access the smap field, which is used to obtain the BTF record to walk over special fields in the map value. While we're at it, drop migrate_disable() from bpf_selem_free_rcu, since migration should be disabled for RCU callbacks already. Fixes: 9bac675e6368 ("bpf: Postpone bpf_obj_free_fields to the rcu callback") Reviewed-by: Amery Hung Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 42 ++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 2bf5ca5ae0df..d7db1ada2dad 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -164,16 +164,28 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage, bpf_local_storage_free_trace_rcu); } +/* rcu callback for use_kmalloc_nolock == false */ +static void __bpf_selem_free_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + + selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + /* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */ + smap = rcu_dereference_check(SDATA(selem)->smap, 1); + + if (smap) + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + kfree(selem); +} + /* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { - struct bpf_local_storage_elem *selem; - - selem = container_of(rcu, struct bpf_local_storage_elem, rcu); if (rcu_trace_implies_rcu_gp()) - kfree(selem); + __bpf_selem_free_rcu(rcu); else - kfree_rcu(selem, rcu); + call_rcu(rcu, __bpf_selem_free_rcu); } /* Handle use_kmalloc_nolock == false */ @@ -181,7 +193,7 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, bool vanilla_rcu) { if (vanilla_rcu) - kfree_rcu(selem, rcu); + call_rcu(&selem->rcu, __bpf_selem_free_rcu); else call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu); } @@ -195,11 +207,8 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); - if (smap) { - migrate_disable(); + if (smap) bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - migrate_enable(); - } kfree_nolock(selem); } @@ -214,18 +223,12 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) void bpf_selem_free(struct bpf_local_storage_elem *selem, bool reuse_now) { - struct bpf_local_storage_map *smap; - - smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - if (!selem->use_kmalloc_nolock) { /* * No uptr will be unpin even when reuse_now == false since uptr * is only supported in task local storage, where * smap->use_kmalloc_nolock == true. */ - if (smap) - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } @@ -958,10 +961,9 @@ restart: */ synchronize_rcu(); - if (smap->use_kmalloc_nolock) { - rcu_barrier_tasks_trace(); - rcu_barrier(); - } + /* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */ + rcu_barrier_tasks_trace(); + rcu_barrier(); kvfree(smap->buckets); bpf_map_area_free(smap); } From baa35b3cb6b642b903162aacff13181e170c4ecc Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:04 -0800 Subject: [PATCH 29/36] bpf: Retire rcu_trace_implies_rcu_gp() from local storage This assumption will always hold going forward, hence just remove the various checks and assume it is true with a comment for the uninformed reader. Reviewed-by: Paul E. McKenney Reviewed-by: Amery Hung Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 37 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index d7db1ada2dad..9c96a4477f81 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -107,14 +107,12 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; - /* If RCU Tasks Trace grace period implies RCU grace period, do - * kfree(), else do kfree_rcu(). + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. */ local_storage = container_of(rcu, struct bpf_local_storage, rcu); - if (rcu_trace_implies_rcu_gp()) - kfree(local_storage); - else - kfree_rcu(local_storage, rcu); + kfree(local_storage); } /* Handle use_kmalloc_nolock == false */ @@ -138,10 +136,11 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu) static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { - if (rcu_trace_implies_rcu_gp()) - bpf_local_storage_free_rcu(rcu); - else - call_rcu(rcu, bpf_local_storage_free_rcu); + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. + */ + bpf_local_storage_free_rcu(rcu); } static void bpf_local_storage_free(struct bpf_local_storage *local_storage, @@ -182,10 +181,11 @@ static void __bpf_selem_free_rcu(struct rcu_head *rcu) /* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { - if (rcu_trace_implies_rcu_gp()) - __bpf_selem_free_rcu(rcu); - else - call_rcu(rcu, __bpf_selem_free_rcu); + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. + */ + __bpf_selem_free_rcu(rcu); } /* Handle use_kmalloc_nolock == false */ @@ -214,10 +214,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) { - if (rcu_trace_implies_rcu_gp()) - bpf_selem_free_rcu(rcu); - else - call_rcu(rcu, bpf_selem_free_rcu); + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. + */ + bpf_selem_free_rcu(rcu); } void bpf_selem_free(struct bpf_local_storage_elem *selem, From 2939d7b3b0e5f35359ce8f69dbbad0bfc4e920b6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 27 Feb 2026 14:48:05 -0800 Subject: [PATCH 30/36] selftests/bpf: Add tests for special fields races Add a couple of tests to ensure that the refcount drops to zero when we exercise the race where creation of a special field succeeds the logical bpf_obj_free_fields done when deleting an element. Prior to previous changes, the fields would be freed eagerly and repopulate and end up leaking, causing the reference to not drop down correctly. Running this test on a kernel without fixes will cause a hang in delete_module, since the module reference stays active due to the leaked kptr not dropping it. After the fixes tests succeed as expected. Reviewed-by: Amery Hung Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260227224806.646888-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/map_kptr_race.c | 218 ++++++++++++++++++ .../selftests/bpf/progs/map_kptr_race.c | 197 ++++++++++++++++ 2 files changed, 415 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/map_kptr_race.c create mode 100644 tools/testing/selftests/bpf/progs/map_kptr_race.c diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c b/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c new file mode 100644 index 000000000000..506ed55e8528 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include +#include + +#include "map_kptr_race.skel.h" + +static int get_map_id(int map_fd) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + + if (!ASSERT_OK(bpf_map_get_info_by_fd(map_fd, &info, &len), "get_map_info")) + return -1; + return info.id; +} + +static int read_refs(struct map_kptr_race *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + int ret; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.count_ref), &opts); + if (!ASSERT_OK(ret, "count_ref run")) + return -1; + if (!ASSERT_OK(opts.retval, "count_ref retval")) + return -1; + return skel->bss->num_of_refs; +} + +static void test_htab_leak(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + struct map_kptr_race *skel, *watcher; + int ret, map_id; + + skel = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_htab_leak), &opts); + if (!ASSERT_OK(ret, "test_htab_leak run")) + goto out_skel; + if (!ASSERT_OK(opts.retval, "test_htab_leak retval")) + goto out_skel; + + map_id = get_map_id(bpf_map__fd(skel->maps.race_hash_map)); + if (!ASSERT_GE(map_id, 0, "map_id")) + goto out_skel; + + watcher = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(watcher, "watcher open_and_load")) + goto out_skel; + + watcher->bss->target_map_id = map_id; + watcher->links.map_put = bpf_program__attach(watcher->progs.map_put); + if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry")) + goto out_watcher; + watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free); + if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit")) + goto out_watcher; + + map_kptr_race__destroy(skel); + skel = NULL; + + kern_sync_rcu(); + + while (!READ_ONCE(watcher->bss->map_freed)) + sched_yield(); + + ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed"); + ASSERT_EQ(read_refs(watcher), 2, "htab refcount"); + +out_watcher: + map_kptr_race__destroy(watcher); +out_skel: + map_kptr_race__destroy(skel); +} + +static void test_percpu_htab_leak(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + struct map_kptr_race *skel, *watcher; + int ret, map_id; + + skel = map_kptr_race__open(); + if (!ASSERT_OK_PTR(skel, "open")) + return; + + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + if (skel->rodata->nr_cpus > 16) + skel->rodata->nr_cpus = 16; + + ret = map_kptr_race__load(skel); + if (!ASSERT_OK(ret, "load")) + goto out_skel; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_percpu_htab_leak), &opts); + if (!ASSERT_OK(ret, "test_percpu_htab_leak run")) + goto out_skel; + if (!ASSERT_OK(opts.retval, "test_percpu_htab_leak retval")) + goto out_skel; + + map_id = get_map_id(bpf_map__fd(skel->maps.race_percpu_hash_map)); + if (!ASSERT_GE(map_id, 0, "map_id")) + goto out_skel; + + watcher = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(watcher, "watcher open_and_load")) + goto out_skel; + + watcher->bss->target_map_id = map_id; + watcher->links.map_put = bpf_program__attach(watcher->progs.map_put); + if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry")) + goto out_watcher; + watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free); + if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit")) + goto out_watcher; + + map_kptr_race__destroy(skel); + skel = NULL; + + kern_sync_rcu(); + + while (!READ_ONCE(watcher->bss->map_freed)) + sched_yield(); + + ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed"); + ASSERT_EQ(read_refs(watcher), 2, "percpu_htab refcount"); + +out_watcher: + map_kptr_race__destroy(watcher); +out_skel: + map_kptr_race__destroy(skel); +} + +static void test_sk_ls_leak(void) +{ + struct map_kptr_race *skel, *watcher; + int listen_fd = -1, client_fd = -1, map_id; + + skel = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (!ASSERT_OK(map_kptr_race__attach(skel), "attach")) + goto out_skel; + + listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(listen_fd, 0, "start_server")) + goto out_skel; + + client_fd = connect_to_fd(listen_fd, 0); + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) + goto out_skel; + + if (!ASSERT_EQ(skel->bss->sk_ls_leak_done, 1, "sk_ls_leak_done")) + goto out_skel; + + close(client_fd); + client_fd = -1; + close(listen_fd); + listen_fd = -1; + + map_id = get_map_id(bpf_map__fd(skel->maps.race_sk_ls_map)); + if (!ASSERT_GE(map_id, 0, "map_id")) + goto out_skel; + + watcher = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(watcher, "watcher open_and_load")) + goto out_skel; + + watcher->bss->target_map_id = map_id; + watcher->links.map_put = bpf_program__attach(watcher->progs.map_put); + if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry")) + goto out_watcher; + watcher->links.sk_map_free = bpf_program__attach(watcher->progs.sk_map_free); + if (!ASSERT_OK_PTR(watcher->links.sk_map_free, "attach fexit")) + goto out_watcher; + + map_kptr_race__destroy(skel); + skel = NULL; + + kern_sync_rcu(); + + while (!READ_ONCE(watcher->bss->map_freed)) + sched_yield(); + + ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed"); + ASSERT_EQ(read_refs(watcher), 2, "sk_ls refcount"); + +out_watcher: + map_kptr_race__destroy(watcher); +out_skel: + if (client_fd >= 0) + close(client_fd); + if (listen_fd >= 0) + close(listen_fd); + map_kptr_race__destroy(skel); +} + +void serial_test_map_kptr_race(void) +{ + if (test__start_subtest("htab_leak")) + test_htab_leak(); + if (test__start_subtest("percpu_htab_leak")) + test_percpu_htab_leak(); + if (test__start_subtest("sk_ls_leak")) + test_sk_ls_leak(); +} diff --git a/tools/testing/selftests/bpf/progs/map_kptr_race.c b/tools/testing/selftests/bpf/progs/map_kptr_race.c new file mode 100644 index 000000000000..f6f136cd8f60 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/map_kptr_race.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include "../test_kmods/bpf_testmod_kfunc.h" + +struct map_value { + struct prog_test_ref_kfunc __kptr *ref_ptr; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 1); +} race_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 1); +} race_percpu_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct map_value); +} race_sk_ls_map SEC(".maps"); + +int num_of_refs; +int sk_ls_leak_done; +int target_map_id; +int map_freed; +const volatile int nr_cpus; + +SEC("tc") +int test_htab_leak(struct __sk_buff *skb) +{ + struct prog_test_ref_kfunc *p, *old; + struct map_value val = {}; + struct map_value *v; + int key = 0; + + if (bpf_map_update_elem(&race_hash_map, &key, &val, BPF_ANY)) + return 1; + + v = bpf_map_lookup_elem(&race_hash_map, &key); + if (!v) + return 2; + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 3; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + bpf_map_delete_elem(&race_hash_map, &key); + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 4; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + return 0; +} + +static int fill_percpu_kptr(struct map_value *v) +{ + struct prog_test_ref_kfunc *p, *old; + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 1; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + return 0; +} + +SEC("tc") +int test_percpu_htab_leak(struct __sk_buff *skb) +{ + struct map_value *v, *arr[16] = {}; + struct map_value val = {}; + int key = 0; + int err = 0; + + if (bpf_map_update_elem(&race_percpu_hash_map, &key, &val, BPF_ANY)) + return 1; + + for (int i = 0; i < nr_cpus; i++) { + v = bpf_map_lookup_percpu_elem(&race_percpu_hash_map, &key, i); + if (!v) + return 2; + arr[i] = v; + } + + bpf_map_delete_elem(&race_percpu_hash_map, &key); + + for (int i = 0; i < nr_cpus; i++) { + v = arr[i]; + err = fill_percpu_kptr(v); + if (err) + return 3; + } + + return 0; +} + +SEC("tp_btf/inet_sock_set_state") +int BPF_PROG(test_sk_ls_leak, struct sock *sk, int oldstate, int newstate) +{ + struct prog_test_ref_kfunc *p, *old; + struct map_value *v; + + if (newstate != BPF_TCP_SYN_SENT) + return 0; + + if (sk_ls_leak_done) + return 0; + + v = bpf_sk_storage_get(&race_sk_ls_map, sk, NULL, + BPF_SK_STORAGE_GET_F_CREATE); + if (!v) + return 0; + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 0; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + bpf_sk_storage_delete(&race_sk_ls_map, sk); + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 0; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + sk_ls_leak_done = 1; + return 0; +} + +long target_map_ptr; + +SEC("fentry/bpf_map_put") +int BPF_PROG(map_put, struct bpf_map *map) +{ + if (target_map_id && map->id == (u32)target_map_id) + target_map_ptr = (long)map; + return 0; +} + +SEC("fexit/htab_map_free") +int BPF_PROG(htab_map_free, struct bpf_map *map) +{ + if (target_map_ptr && (long)map == target_map_ptr) + map_freed = 1; + return 0; +} + +SEC("fexit/bpf_sk_storage_map_free") +int BPF_PROG(sk_map_free, struct bpf_map *map) +{ + if (target_map_ptr && (long)map == target_map_ptr) + map_freed = 1; + return 0; +} + +SEC("syscall") +int count_ref(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long arg = 0; + + p = bpf_kfunc_call_test_acquire(&arg); + if (!p) + return 1; + + num_of_refs = p->cnt.refs.counter; + + bpf_kfunc_call_test_release(p); + return 0; +} + +char _license[] SEC("license") = "GPL"; From 869c63d5975d55e97f6b168e885452b3da20ea47 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 25 Feb 2026 20:14:55 +0800 Subject: [PATCH 31/36] bpf: Fix race in cpumap on PREEMPT_RT On PREEMPT_RT kernels, the per-CPU xdp_bulk_queue (bq) can be accessed concurrently by multiple preemptible tasks on the same CPU. The original code assumes bq_enqueue() and __cpu_map_flush() run atomically with respect to each other on the same CPU, relying on local_bh_disable() to prevent preemption. However, on PREEMPT_RT, local_bh_disable() only calls migrate_disable() (when PREEMPT_RT_NEEDS_BH_LOCK is not set) and does not disable preemption, which allows CFS scheduling to preempt a task during bq_flush_to_queue(), enabling another task on the same CPU to enter bq_enqueue() and operate on the same per-CPU bq concurrently. This leads to several races: 1. Double __list_del_clearprev(): after bq->count is reset in bq_flush_to_queue(), a preempting task can call bq_enqueue() -> bq_flush_to_queue() on the same bq when bq->count reaches CPU_MAP_BULK_SIZE. Both tasks then call __list_del_clearprev() on the same bq->flush_node, the second call dereferences the prev pointer that was already set to NULL by the first. 2. bq->count and bq->q[] races: concurrent bq_enqueue() can corrupt the packet queue while bq_flush_to_queue() is processing it. The race between task A (__cpu_map_flush -> bq_flush_to_queue) and task B (bq_enqueue -> bq_flush_to_queue) on the same CPU: Task A (xdp_do_flush) Task B (cpu_map_enqueue) ---------------------- ------------------------ bq_flush_to_queue(bq) spin_lock(&q->producer_lock) /* flush bq->q[] to ptr_ring */ bq->count = 0 spin_unlock(&q->producer_lock) bq_enqueue(rcpu, xdpf) <-- CFS preempts Task A --> bq->q[bq->count++] = xdpf /* ... more enqueues until full ... */ bq_flush_to_queue(bq) spin_lock(&q->producer_lock) /* flush to ptr_ring */ spin_unlock(&q->producer_lock) __list_del_clearprev(flush_node) /* sets flush_node.prev = NULL */ <-- Task A resumes --> __list_del_clearprev(flush_node) flush_node.prev->next = ... /* prev is NULL -> kernel oops */ Fix this by adding a local_lock_t to xdp_bulk_queue and acquiring it in bq_enqueue() and __cpu_map_flush(). These paths already run under local_bh_disable(), so use local_lock_nested_bh() which on non-RT is a pure annotation with no overhead, and on PREEMPT_RT provides a per-CPU sleeping lock that serializes access to the bq. To reproduce, insert an mdelay(100) between bq->count = 0 and __list_del_clearprev() in bq_flush_to_queue(), then run reproducer provided by syzkaller. Fixes: 3253cb49cbad ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") Reported-by: syzbot+2b3391f44313b3983e91@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69369331.a70a0220.38f243.009d.GAE@google.com/T/ Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Jiayuan Chen Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260225121459.183121-2-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 04171fbc39cb..32b43cb9061b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,7 @@ struct xdp_bulk_queue { struct list_head flush_node; struct bpf_cpu_map_entry *obj; unsigned int count; + local_lock_t bq_lock; }; /* Struct for every remote "destination" CPU in map */ @@ -451,6 +453,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, for_each_possible_cpu(i) { bq = per_cpu_ptr(rcpu->bulkq, i); bq->obj = rcpu; + local_lock_init(&bq->bq_lock); } /* Alloc queue */ @@ -722,6 +725,8 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq) struct ptr_ring *q; int i; + lockdep_assert_held(&bq->bq_lock); + if (unlikely(!bq->count)) return; @@ -749,11 +754,15 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq) } /* Runs under RCU-read-side, plus in softirq under NAPI protection. - * Thus, safe percpu variable access. + * Thus, safe percpu variable access. PREEMPT_RT relies on + * local_lock_nested_bh() to serialise access to the per-CPU bq. */ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); + struct xdp_bulk_queue *bq; + + local_lock_nested_bh(&rcpu->bulkq->bq_lock); + bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) bq_flush_to_queue(bq); @@ -774,6 +783,8 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) list_add(&bq->flush_node, flush_list); } + + local_unlock_nested_bh(&rcpu->bulkq->bq_lock); } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, @@ -810,7 +821,9 @@ void __cpu_map_flush(struct list_head *flush_list) struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + local_lock_nested_bh(&bq->obj->bulkq->bq_lock); bq_flush_to_queue(bq); + local_unlock_nested_bh(&bq->obj->bulkq->bq_lock); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(bq->obj->kthread); From 1872e75375c40add4a35990de3be77b5741c252c Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 25 Feb 2026 20:14:56 +0800 Subject: [PATCH 32/36] bpf: Fix race in devmap on PREEMPT_RT On PREEMPT_RT kernels, the per-CPU xdp_dev_bulk_queue (bq) can be accessed concurrently by multiple preemptible tasks on the same CPU. The original code assumes bq_enqueue() and __dev_flush() run atomically with respect to each other on the same CPU, relying on local_bh_disable() to prevent preemption. However, on PREEMPT_RT, local_bh_disable() only calls migrate_disable() (when PREEMPT_RT_NEEDS_BH_LOCK is not set) and does not disable preemption, which allows CFS scheduling to preempt a task during bq_xmit_all(), enabling another task on the same CPU to enter bq_enqueue() and operate on the same per-CPU bq concurrently. This leads to several races: 1. Double-free / use-after-free on bq->q[]: bq_xmit_all() snapshots cnt = bq->count, then iterates bq->q[0..cnt-1] to transmit frames. If preempted after the snapshot, a second task can call bq_enqueue() -> bq_xmit_all() on the same bq, transmitting (and freeing) the same frames. When the first task resumes, it operates on stale pointers in bq->q[], causing use-after-free. 2. bq->count and bq->q[] corruption: concurrent bq_enqueue() modifying bq->count and bq->q[] while bq_xmit_all() is reading them. 3. dev_rx/xdp_prog teardown race: __dev_flush() clears bq->dev_rx and bq->xdp_prog after bq_xmit_all(). If preempted between bq_xmit_all() return and bq->dev_rx = NULL, a preempting bq_enqueue() sees dev_rx still set (non-NULL), skips adding bq to the flush_list, and enqueues a frame. When __dev_flush() resumes, it clears dev_rx and removes bq from the flush_list, orphaning the newly enqueued frame. 4. __list_del_clearprev() on flush_node: similar to the cpumap race, both tasks can call __list_del_clearprev() on the same flush_node, the second dereferences the prev pointer already set to NULL. The race between task A (__dev_flush -> bq_xmit_all) and task B (bq_enqueue -> bq_xmit_all) on the same CPU: Task A (xdp_do_flush) Task B (ndo_xdp_xmit redirect) ---------------------- -------------------------------- __dev_flush(flush_list) bq_xmit_all(bq) cnt = bq->count /* e.g. 16 */ /* start iterating bq->q[] */ <-- CFS preempts Task A --> bq_enqueue(dev, xdpf) bq->count == DEV_MAP_BULK_SIZE bq_xmit_all(bq, 0) cnt = bq->count /* same 16! */ ndo_xdp_xmit(bq->q[]) /* frames freed by driver */ bq->count = 0 <-- Task A resumes --> ndo_xdp_xmit(bq->q[]) /* use-after-free: frames already freed! */ Fix this by adding a local_lock_t to xdp_dev_bulk_queue and acquiring it in bq_enqueue() and __dev_flush(). These paths already run under local_bh_disable(), so use local_lock_nested_bh() which on non-RT is a pure annotation with no overhead, and on PREEMPT_RT provides a per-CPU sleeping lock that serializes access to the bq. Fixes: 3253cb49cbad ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") Reported-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Jiayuan Chen Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260225121459.183121-3-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2984e938f94d..3d619d01088e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -45,6 +45,7 @@ * types of devmap; only the lookup and insertion is different. */ #include +#include #include #include #include @@ -60,6 +61,7 @@ struct xdp_dev_bulk_queue { struct net_device *dev_rx; struct bpf_prog *xdp_prog; unsigned int count; + local_lock_t bq_lock; }; struct bpf_dtab_netdev { @@ -381,6 +383,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) int to_send = cnt; int i; + lockdep_assert_held(&bq->bq_lock); + if (unlikely(!cnt)) return; @@ -425,10 +429,12 @@ void __dev_flush(struct list_head *flush_list) struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock); bq_xmit_all(bq, XDP_XMIT_FLUSH); bq->dev_rx = NULL; bq->xdp_prog = NULL; __list_del_clearprev(&bq->flush_node); + local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock); } } @@ -451,12 +457,16 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu * variable access, and map elements stick around. See comment above - * xdp_do_flush() in filter.c. + * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh() + * to serialise access to the per-CPU bq. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { - struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); + struct xdp_dev_bulk_queue *bq; + + local_lock_nested_bh(&dev->xdp_bulkq->bq_lock); + bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); @@ -477,6 +487,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, } bq->q[bq->count++] = xdpf; + + local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock); } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, @@ -1127,8 +1139,13 @@ static int dev_map_notification(struct notifier_block *notifier, if (!netdev->xdp_bulkq) return NOTIFY_BAD; - for_each_possible_cpu(cpu) - per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; + for_each_possible_cpu(cpu) { + struct xdp_dev_bulk_queue *bq; + + bq = per_cpu_ptr(netdev->xdp_bulkq, cpu); + bq->dev = netdev; + local_lock_init(&bq->bq_lock); + } break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because From 76e954155b45294c502e3d3a9e15757c858ca55e Mon Sep 17 00:00:00 2001 From: Harishankar Vishwanathan Date: Fri, 27 Feb 2026 22:32:21 +0100 Subject: [PATCH 33/36] bpf: Introduce tnum_step to step through tnum's members This commit introduces tnum_step(), a function that, when given t, and a number z returns the smallest member of t larger than z. The number z must be greater or equal to the smallest member of t and less than the largest member of t. The first step is to compute j, a number that keeps all of t's known bits, and matches all unknown bits to z's bits. Since j is a member of the t, it is already a candidate for result. However, we want our result to be (minimally) greater than z. There are only two possible cases: (1) Case j <= z. In this case, we want to increase the value of j and make it > z. (2) Case j > z. In this case, we want to decrease the value of j while keeping it > z. (Case 1) j <= z t = xx11x0x0 z = 10111101 (189) j = 10111000 (184) ^ k (Case 1.1) Let's first consider the case where j < z. We will address j == z later. Since z > j, there had to be a bit position that was 1 in z and a 0 in j, beyond which all positions of higher significance are equal in j and z. Further, this position could not have been unknown in a, because the unknown positions of a match z. This position had to be a 1 in z and known 0 in t. Let k be position of the most significant 1-to-0 flip. In our example, k = 3 (starting the count at 1 at the least significant bit). Setting (to 1) the unknown bits of t in positions of significance smaller than k will not produce a result > z. Hence, we must set/unset the unknown bits at positions of significance higher than k. Specifically, we look for the next larger combination of 1s and 0s to place in those positions, relative to the combination that exists in z. We can achieve this by concatenating bits at unknown positions of t into an integer, adding 1, and writing the bits of that result back into the corresponding bit positions previously extracted from z. >From our example, considering only positions of significance greater than k: t = xx..x z = 10..1 + 1 ----- 11..0 This is the exact combination 1s and 0s we need at the unknown bits of t in positions of significance greater than k. Further, our result must only increase the value minimally above z. Hence, unknown bits in positions of significance smaller than k should remain 0. We finally have, result = 11110000 (240) (Case 1.2) Now consider the case when j = z, for example t = 1x1x0xxx z = 10110100 (180) j = 10110100 (180) Matching the unknown bits of the t to the bits of z yielded exactly z. To produce a number greater than z, we must set/unset the unknown bits in t, and *all* the unknown bits of t candidates for being set/unset. We can do this similar to Case 1.1, by adding 1 to the bits extracted from the masked bit positions of z. Essentially, this case is equivalent to Case 1.1, with k = 0. t = 1x1x0xxx z = .0.1.100 + 1 --------- .0.1.101 This is the exact combination of bits needed in the unknown positions of t. After recalling the known positions of t, we get result = 10110101 (181) (Case 2) j > z t = x00010x1 z = 10000010 (130) j = 10001011 (139) ^ k Since j > z, there had to be a bit position which was 0 in z, and a 1 in j, beyond which all positions of higher significance are equal in j and z. This position had to be a 0 in z and known 1 in t. Let k be the position of the most significant 0-to-1 flip. In our example, k = 4. Because of the 0-to-1 flip at position k, a member of t can become greater than z if the bits in positions greater than k are themselves >= to z. To make that member *minimally* greater than z, the bits in positions greater than k must be exactly = z. Hence, we simply match all of t's unknown bits in positions more significant than k to z's bits. In positions less significant than k, we set all t's unknown bits to 0 to retain minimality. In our example, in positions of greater significance than k (=4), t=x000. These positions are matched with z (1000) to produce 1000. In positions of lower significance than k, t=10x1. All unknown bits are set to 0 to produce 1001. The final result is: result = 10001001 (137) This concludes the computation for a result > z that is a member of t. The procedure for tnum_step() in this commit implements the idea described above. As a proof of correctness, we verified the algorithm against a logical specification of tnum_step. The specification asserts the following about the inputs t, z and output res that: 1. res is a member of t, and 2. res is strictly greater than z, and 3. there does not exist another value res2 such that 3a. res2 is also a member of t, and 3b. res2 is greater than z 3c. res2 is smaller than res We checked the implementation against this logical specification using an SMT solver. The verification formula in SMTLIB format is available at [1]. The verification returned an "unsat": indicating that no input assignment exists for which the implementation and the specification produce different outputs. In addition, we also automatically generated the logical encoding of the C implementation using Agni [2] and verified it against the same specification. This verification also returned an "unsat", confirming that the implementation is equivalent to the specification. The formula for this check is also available at [3]. Link: https://pastebin.com/raw/2eRWbiit [1] Link: https://github.com/bpfverif/agni [2] Link: https://pastebin.com/raw/EztVbBJ2 [3] Co-developed-by: Srinivas Narayana Signed-off-by: Srinivas Narayana Co-developed-by: Santosh Nagarakatte Signed-off-by: Santosh Nagarakatte Signed-off-by: Harishankar Vishwanathan Link: https://lore.kernel.org/r/93fdf71910411c0f19e282ba6d03b4c65f9c5d73.1772225741.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 3 +++ kernel/bpf/tnum.c | 56 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/include/linux/tnum.h b/include/linux/tnum.h index fa4654ffb621..ca2cfec8de08 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -131,4 +131,7 @@ static inline bool tnum_subreg_is_const(struct tnum a) return !(tnum_subreg(a)).mask; } +/* Returns the smallest member of t larger than z */ +u64 tnum_step(struct tnum t, u64 z); + #endif /* _LINUX_TNUM_H */ diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 26fbfbb01700..4abc359b3db0 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -269,3 +269,59 @@ struct tnum tnum_bswap64(struct tnum a) { return TNUM(swab64(a.value), swab64(a.mask)); } + +/* Given tnum t, and a number z such that tmin <= z < tmax, where tmin + * is the smallest member of the t (= t.value) and tmax is the largest + * member of t (= t.value | t.mask), returns the smallest member of t + * larger than z. + * + * For example, + * t = x11100x0 + * z = 11110001 (241) + * result = 11110010 (242) + * + * Note: if this function is called with z >= tmax, it just returns + * early with tmax; if this function is called with z < tmin, the + * algorithm already returns tmin. + */ +u64 tnum_step(struct tnum t, u64 z) +{ + u64 tmax, j, p, q, r, s, v, u, w, res; + u8 k; + + tmax = t.value | t.mask; + + /* if z >= largest member of t, return largest member of t */ + if (z >= tmax) + return tmax; + + /* if z < smallest member of t, return smallest member of t */ + if (z < t.value) + return t.value; + + /* keep t's known bits, and match all unknown bits to z */ + j = t.value | (z & t.mask); + + if (j > z) { + p = ~z & t.value & ~t.mask; + k = fls64(p); /* k is the most-significant 0-to-1 flip */ + q = U64_MAX << k; + r = q & z; /* positions > k matched to z */ + s = ~q & t.value; /* positions <= k matched to t.value */ + v = r | s; + res = v; + } else { + p = z & ~t.value & ~t.mask; + k = fls64(p); /* k is the most-significant 1-to-0 flip */ + q = U64_MAX << k; + r = q & t.mask & z; /* unknown positions > k, matched to z */ + s = q & ~t.mask; /* known positions > k, set to 1 */ + v = r | s; + /* add 1 to unknown positions > k to make value greater than z */ + u = v + (1ULL << k); + /* extract bits in unknown positions > k from u, rest from t.value */ + w = (u & t.mask) | t.value; + res = w; + } + return res; +} From efc11a667878a1d655ff034a93a539debbfedb12 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Fri, 27 Feb 2026 22:35:02 +0100 Subject: [PATCH 34/36] bpf: Improve bounds when tnum has a single possible value We're hitting an invariant violation in Cilium that sometimes leads to BPF programs being rejected and Cilium failing to start [1]. The following extract from verifier logs shows what's happening: from 201 to 236: R1=0 R6=ctx() R7=1 R9=scalar(smin=umin=smin32=umin32=3584,smax=umax=smax32=umax32=3840,var_off=(0xe00; 0x100)) R10=fp0 236: R1=0 R6=ctx() R7=1 R9=scalar(smin=umin=smin32=umin32=3584,smax=umax=smax32=umax32=3840,var_off=(0xe00; 0x100)) R10=fp0 ; if (magic == MARK_MAGIC_HOST || magic == MARK_MAGIC_OVERLAY || magic == MARK_MAGIC_ENCRYPT) @ bpf_host.c:1337 236: (16) if w9 == 0xe00 goto pc+45 ; R9=scalar(smin=umin=smin32=umin32=3585,smax=umax=smax32=umax32=3840,var_off=(0xe00; 0x100)) 237: (16) if w9 == 0xf00 goto pc+1 verifier bug: REG INVARIANTS VIOLATION (false_reg1): range bounds violation u64=[0xe01, 0xe00] s64=[0xe01, 0xe00] u32=[0xe01, 0xe00] s32=[0xe01, 0xe00] var_off=(0xe00, 0x0) We reach instruction 236 with two possible values for R9, 0xe00 and 0xf00. This is perfectly reflected in the tnum, but of course the ranges are less accurate and cover [0xe00; 0xf00]. Taking the fallthrough path at instruction 236 allows the verifier to reduce the range to [0xe01; 0xf00]. The tnum is however not updated. With these ranges, at instruction 237, the verifier is not able to deduce that R9 is always equal to 0xf00. Hence the fallthrough pass is explored first, the verifier refines the bounds using the assumption that R9 != 0xf00, and ends up with an invariant violation. This pattern of impossible branch + bounds refinement is common to all invariant violations seen so far. The long-term solution is likely to rely on the refinement + invariant violation check to detect dead branches, as started by Eduard. To fix the current issue, we need something with less refactoring that we can backport. This patch uses the tnum_step helper introduced in the previous patch to detect the above situation. In particular, three cases are now detected in the bounds refinement: 1. The u64 range and the tnum only overlap in umin. u64: ---[xxxxxx]----- tnum: --xx----------x- 2. The u64 range and the tnum only overlap in the maximum value represented by the tnum, called tmax. u64: ---[xxxxxx]----- tnum: xx-----x-------- 3. The u64 range and the tnum only overlap in between umin (excluded) and umax. u64: ---[xxxxxx]----- tnum: xx----x-------x- To detect these three cases, we call tnum_step(tnum, umin), which returns the smallest member of the tnum greater than umin, called tnum_next here. We're in case (1) if umin is part of the tnum and tnum_next is greater than umax. We're in case (2) if umin is not part of the tnum and tnum_next is equal to tmax. Finally, we're in case (3) if umin is not part of the tnum, tnum_next is inferior or equal to umax, and calling tnum_step a second time gives us a value past umax. This change implements these three cases. With it, the above bytecode looks as follows: 0: (85) call bpf_get_prandom_u32#7 ; R0=scalar() 1: (47) r0 |= 3584 ; R0=scalar(smin=0x8000000000000e00,umin=umin32=3584,smin32=0x80000e00,var_off=(0xe00; 0xfffffffffffff1ff)) 2: (57) r0 &= 3840 ; R0=scalar(smin=umin=smin32=umin32=3584,smax=umax=smax32=umax32=3840,var_off=(0xe00; 0x100)) 3: (15) if r0 == 0xe00 goto pc+2 ; R0=3840 4: (15) if r0 == 0xf00 goto pc+1 4: R0=3840 6: (95) exit In addition to the new selftests, this change was also verified with Agni [3]. For the record, the raw SMT is available at [4]. The property it verifies is that: If a concrete value x is contained in all input abstract values, after __update_reg_bounds, it will continue to be contained in all output abstract values. Link: https://github.com/cilium/cilium/issues/44216 [1] Link: https://pchaigno.github.io/test-verifier-complexity.html [2] Link: https://github.com/bpfverif/agni [3] Link: https://pastebin.com/raw/naCfaqNx [4] Fixes: 0df1a55afa83 ("bpf: Warn on internal verifier errors") Acked-by: Eduard Zingerman Tested-by: Marco Schirrmeister Co-developed-by: Harishankar Vishwanathan Signed-off-by: Harishankar Vishwanathan Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/ef254c4f68be19bd393d450188946821c588565d.1772225741.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb12ba020649..401d6c4960ec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2379,6 +2379,9 @@ static void __update_reg32_bounds(struct bpf_reg_state *reg) static void __update_reg64_bounds(struct bpf_reg_state *reg) { + u64 tnum_next, tmax; + bool umin_in_tnum; + /* min signed is max(sign bit) | min(other bits) */ reg->smin_value = max_t(s64, reg->smin_value, reg->var_off.value | (reg->var_off.mask & S64_MIN)); @@ -2388,6 +2391,33 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) reg->umin_value = max(reg->umin_value, reg->var_off.value); reg->umax_value = min(reg->umax_value, reg->var_off.value | reg->var_off.mask); + + /* Check if u64 and tnum overlap in a single value */ + tnum_next = tnum_step(reg->var_off, reg->umin_value); + umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value; + tmax = reg->var_off.value | reg->var_off.mask; + if (umin_in_tnum && tnum_next > reg->umax_value) { + /* The u64 range and the tnum only overlap in umin. + * u64: ---[xxxxxx]----- + * tnum: --xx----------x- + */ + ___mark_reg_known(reg, reg->umin_value); + } else if (!umin_in_tnum && tnum_next == tmax) { + /* The u64 range and the tnum only overlap in the maximum value + * represented by the tnum, called tmax. + * u64: ---[xxxxxx]----- + * tnum: xx-----x-------- + */ + ___mark_reg_known(reg, tmax); + } else if (!umin_in_tnum && tnum_next <= reg->umax_value && + tnum_step(reg->var_off, tnum_next) > reg->umax_value) { + /* The u64 range and the tnum only overlap in between umin + * (excluded) and umax. + * u64: ---[xxxxxx]----- + * tnum: xx----x-------x- + */ + ___mark_reg_known(reg, tnum_next); + } } static void __update_reg_bounds(struct bpf_reg_state *reg) From e6ad477d1bf8829973cddd9accbafa9d1a6cd15a Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Fri, 27 Feb 2026 22:36:30 +0100 Subject: [PATCH 35/36] selftests/bpf: Test refinement of single-value tnum This patch introduces selftests to cover the new bounds refinement logic introduced in the previous patch. Without the previous patch, the first two tests fail because of the invariant violation they trigger. The last test fails because the R10 access is not detected as dead code. In addition, all three tests fail because of R0 having a non-constant value in the verifier logs. In addition, the last two cases are covering the negative cases: when we shouldn't refine the bounds because the u64 and tnum overlap in at least two values. Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/90d880c8cf587b9f7dc715d8961cd1b8111d01a8.1772225741.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_bounds.c | 137 ++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 560531404bce..97065a26cf70 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1863,4 +1863,141 @@ l1_%=: r0 = 1; \ : __clobber_all); } +/* This test covers the bounds deduction when the u64 range and the tnum + * overlap only at umax. After instruction 3, the ranges look as follows: + * + * 0 umin=0xe01 umax=0xf00 U64_MAX + * | [xxxxxxxxxxxxxx] | + * |----------------------------|------------------------------| + * | x x | tnum values + * + * The verifier can therefore deduce that the R0=0xf0=240. + */ +SEC("socket") +__description("bounds refinement with single-value tnum on umax") +__msg("3: (15) if r0 == 0xe0 {{.*}} R0=240") +__success __log_level(2) +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void bounds_refinement_tnum_umax(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + r0 |= 0xe0; \ + r0 &= 0xf0; \ + if r0 == 0xe0 goto +2; \ + if r0 == 0xf0 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* This test covers the bounds deduction when the u64 range and the tnum + * overlap only at umin. After instruction 3, the ranges look as follows: + * + * 0 umin=0xe00 umax=0xeff U64_MAX + * | [xxxxxxxxxxxxxx] | + * |----------------------------|------------------------------| + * | x x | tnum values + * + * The verifier can therefore deduce that the R0=0xe0=224. + */ +SEC("socket") +__description("bounds refinement with single-value tnum on umin") +__msg("3: (15) if r0 == 0xf0 {{.*}} R0=224") +__success __log_level(2) +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void bounds_refinement_tnum_umin(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + r0 |= 0xe0; \ + r0 &= 0xf0; \ + if r0 == 0xf0 goto +2; \ + if r0 == 0xe0 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* This test covers the bounds deduction when the only possible tnum value is + * in the middle of the u64 range. After instruction 3, the ranges look as + * follows: + * + * 0 umin=0x7cf umax=0x7df U64_MAX + * | [xxxxxxxxxxxx] | + * |----------------------------|------------------------------| + * | x x x x x | tnum values + * | +--- 0x7e0 + * +--- 0x7d0 + * + * Since the lower four bits are zero, the tnum and the u64 range only overlap + * in R0=0x7d0=2000. Instruction 5 is therefore dead code. + */ +SEC("socket") +__description("bounds refinement with single-value tnum in middle of range") +__msg("3: (a5) if r0 < 0x7cf {{.*}} R0=2000") +__success __log_level(2) +__naked void bounds_refinement_tnum_middle(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + if r0 & 0x0f goto +4; \ + if r0 > 0x7df goto +3; \ + if r0 < 0x7cf goto +2; \ + if r0 == 0x7d0 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* This test cover the negative case for the tnum/u64 overlap. Since + * they contain the same two values (i.e., {0, 1}), we can't deduce + * anything more. + */ +SEC("socket") +__description("bounds refinement: several overlaps between tnum and u64") +__msg("2: (25) if r0 > 0x1 {{.*}} R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=1,var_off=(0x0; 0x1))") +__failure __log_level(2) +__naked void bounds_refinement_several_overlaps(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + if r0 < 0 goto +3; \ + if r0 > 1 goto +2; \ + if r0 == 1 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* This test cover the negative case for the tnum/u64 overlap. Since + * they overlap in the two values contained by the u64 range (i.e., + * {0xf, 0x10}), we can't deduce anything more. + */ +SEC("socket") +__description("bounds refinement: multiple overlaps between tnum and u64") +__msg("2: (25) if r0 > 0x10 {{.*}} R0=scalar(smin=umin=smin32=umin32=15,smax=umax=smax32=umax32=16,var_off=(0x0; 0x1f))") +__failure __log_level(2) +__naked void bounds_refinement_multiple_overlaps(void *ctx) +{ + asm volatile(" \ + call %[bpf_get_prandom_u32]; \ + if r0 < 0xf goto +3; \ + if r0 > 0x10 goto +2; \ + if r0 == 0x10 goto +1; \ + r10 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; From 024cea2d647ed8ab942f19544b892d324dba42b4 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Fri, 27 Feb 2026 22:42:45 +0100 Subject: [PATCH 36/36] selftests/bpf: Avoid simplification of crafted bounds test The reg_bounds_crafted tests validate the verifier's range analysis logic. They focus on the actual ranges and thus ignore the tnum. As a consequence, they carry the assumption that the tested cases can be reproduced in userspace without using the tnum information. Unfortunately, the previous change the refinement logic breaks that assumption for one test case: (u64)2147483648 (u32) [4294967294; 0x100000000] The tested bytecode is shown below. Without our previous improvement, on the false branch of the condition, R7 is only known to have u64 range [0xfffffffe; 0x100000000]. With our improvement, and using the tnum information, we can deduce that R7 equals 0x100000000. 19: (bc) w0 = w6 ; R6=0x80000000 20: (bc) w0 = w7 ; R7=scalar(smin=umin=0xfffffffe,smax=umax=0x100000000,smin32=-2,smax32=0,var_off=(0x0; 0x1ffffffff)) 21: (be) if w6 <= w7 goto pc+3 ; R6=0x80000000 R7=0x100000000 R7's tnum is (0; 0x1ffffffff). On the false branch, regs_refine_cond_op refines R7's u32 range to [0; 0x7fffffff]. Then, __reg32_deduce_bounds refines the s32 range to 0 using u32 and finally also sets u32=0. From this, __reg_bound_offset improves the tnum to (0; 0x100000000). Finally, our previous patch uses this new tnum to deduce that it only intersect with u64=[0xfffffffe; 0x100000000] in a single value: 0x100000000. Because the verifier uses the tnum to reach this constant value, the selftest is unable to reproduce it by only simulating ranges. The solution implemented in this patch is to change the test case such that there is more than one overlap value between u64 and the tnum. The max. u64 value is thus changed from 0x100000000 to 0x300000000. Acked-by: Eduard Zingerman Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/50641c6a7ef39520595dcafa605692427c1006ec.1772225741.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/reg_bounds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index d93a0c7b1786..0322f817d07b 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -2091,7 +2091,7 @@ static struct subtest_case crafted_cases[] = { {U64, S64, {0, 0xffffffffULL}, {0x7fffffff, 0x7fffffff}}, {U64, U32, {0, 0x100000000}, {0, 0}}, - {U64, U32, {0xfffffffe, 0x100000000}, {0x80000000, 0x80000000}}, + {U64, U32, {0xfffffffe, 0x300000000}, {0x80000000, 0x80000000}}, {U64, S32, {0, 0xffffffff00000000ULL}, {0, 0}}, /* these are tricky cases where lower 32 bits allow to tighten 64