Merge branch 'mptcp-pm-prep-work-for-new-ops-and-sysctl-knobs'

Matthieu Baerts says:

====================
mptcp: pm: prep work for new ops and sysctl knobs

Here are a few cleanups, preparation work for the new PM ops, and sysctl
knobs.

- Patch 1: reorg: move generic NL code used by all PMs to pm_netlink.c.

- Patch 2: use kmemdup() instead of kmalloc + copy.

- Patch 3: small cleanup to use pm var instead of msk->pm.

- Patch 4: reorg: id_avail_bitmap is only used by the in-kernel PM.

- Patch 5: use struct_group to easily reset a subset of PM data vars.

- Patch 6: introduce the minimal skeleton for the new PM ops.

- Patch 7: register in-kernel and userspace PM ops.

- Patch 8: new net.mptcp.path_manager sysctl knob, deprecating pm_type.

- Patch 9: map the new path_manager sysctl knob with pm_type.

- Patch 10: map the old pm_type sysctl knob with path_manager.

- Patch 11: new net.mptcp.available_path_managers sysctl knob.

- Patch 12: new test to validate path_manager and pm_type mapping.

Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
====================

Link: https://patch.msgid.link/20250313-net-next-mptcp-pm-ops-intro-v1-0-f4e4a88efc50@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
Paolo Abeni
2025-03-20 10:14:52 +01:00
9 changed files with 301 additions and 25 deletions

View File

@@ -30,6 +30,10 @@ allow_join_initial_addr_port - BOOLEAN
Default: 1
available_path_managers - STRING
Shows the available path manager choices that are registered. More
path managers may be available, but not loaded.
available_schedulers - STRING
Shows the available schedulers choices that are registered. More packet
schedulers may be available, but not loaded.
@@ -72,6 +76,23 @@ enabled - BOOLEAN
Default: 1 (enabled)
path_manager - STRING
Set the default path manager name to use for each new MPTCP
socket. In-kernel path management will control subflow
connections and address advertisements according to
per-namespace values configured over the MPTCP netlink
API. Userspace path management puts per-MPTCP-connection subflow
connection decisions and address advertisements under control of
a privileged userspace program, at the cost of more netlink
traffic to propagate all of the related events and commands.
This is a per-namespace sysctl.
* "kernel" - In-kernel path manager
* "userspace" - Userspace path manager
Default: "kernel"
pm_type - INTEGER
Set the default path manager type to use for each new MPTCP
socket. In-kernel path management will control subflow
@@ -84,6 +105,8 @@ pm_type - INTEGER
This is a per-namespace sysctl.
Deprecated since v6.15, use path_manager instead.
* 0 - In-kernel path manager
* 1 - Userspace path manager

View File

@@ -14,6 +14,7 @@
struct mptcp_info;
struct mptcp_sock;
struct mptcp_pm_addr_entry;
struct seq_file;
/* MPTCP sk_buff extension data */
@@ -121,6 +122,19 @@ struct mptcp_sched_ops {
void (*release)(struct mptcp_sock *msk);
} ____cacheline_aligned_in_smp;
#define MPTCP_PM_NAME_MAX 16
#define MPTCP_PM_MAX 128
#define MPTCP_PM_BUF_MAX (MPTCP_PM_NAME_MAX * MPTCP_PM_MAX)
struct mptcp_pm_ops {
char name[MPTCP_PM_NAME_MAX];
struct module *owner;
struct list_head list;
void (*init)(struct mptcp_sock *msk);
void (*release)(struct mptcp_sock *msk);
} ____cacheline_aligned_in_smp;
#ifdef CONFIG_MPTCP
void mptcp_init(void);

View File

@@ -39,6 +39,7 @@ struct mptcp_pernet {
u8 allow_join_initial_addr_port;
u8 pm_type;
char scheduler[MPTCP_SCHED_NAME_MAX];
char path_manager[MPTCP_PM_NAME_MAX];
};
static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
@@ -83,6 +84,11 @@ int mptcp_get_pm_type(const struct net *net)
return mptcp_get_pernet(net)->pm_type;
}
const char *mptcp_get_path_manager(const struct net *net)
{
return mptcp_get_pernet(net)->path_manager;
}
const char *mptcp_get_scheduler(const struct net *net)
{
return mptcp_get_pernet(net)->scheduler;
@@ -101,6 +107,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
pernet->stale_loss_cnt = 4;
pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
strscpy(pernet->scheduler, "default", sizeof(pernet->scheduler));
strscpy(pernet->path_manager, "kernel", sizeof(pernet->path_manager));
}
#ifdef CONFIG_SYSCTL
@@ -174,6 +181,96 @@ static int proc_blackhole_detect_timeout(const struct ctl_table *table,
return ret;
}
/* Set @path_manager to @name if a path manager with that name is currently
 * registered. Returns 0 on success, -ENOENT when no such PM exists.
 */
static int mptcp_set_path_manager(char *path_manager, const char *name)
{
	int err = -ENOENT;

	/* The registered-PM list is RCU protected for readers */
	rcu_read_lock();
	if (mptcp_pm_find(name)) {
		strscpy(path_manager, name, MPTCP_PM_NAME_MAX);
		err = 0;
	}
	rcu_read_unlock();

	return err;
}
/* sysctl handler for net.mptcp.path_manager (STRING).
 *
 * On read, reports the per-netns default PM name. On a successful write,
 * validates the new name against the registered PMs and mirrors the choice
 * into the deprecated pm_type knob so both knobs stay consistent.
 */
static int proc_path_manager(const struct ctl_table *ctl, int write,
			     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct mptcp_pernet *pernet = container_of(ctl->data,
						   struct mptcp_pernet,
						   path_manager);
	char (*path_manager)[MPTCP_PM_NAME_MAX] = ctl->data;
	char pm_name[MPTCP_PM_NAME_MAX];
	/* Stage user I/O through a local buffer so the pernet value is only
	 * updated once the new name has been validated.
	 */
	const struct ctl_table tbl = {
		.data = pm_name,
		.maxlen = MPTCP_PM_NAME_MAX,
	};
	int ret;
	strscpy(pm_name, *path_manager, MPTCP_PM_NAME_MAX);
	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0) {
		ret = mptcp_set_path_manager(*path_manager, pm_name);
		if (ret == 0) {
			/* Keep the legacy pm_type knob in sync; a registered
			 * PM that is neither built-in name maps to the
			 * out-of-range sentinel __MPTCP_PM_TYPE_NR.
			 */
			u8 pm_type = __MPTCP_PM_TYPE_NR;
			if (strncmp(pm_name, "kernel", MPTCP_PM_NAME_MAX) == 0)
				pm_type = MPTCP_PM_TYPE_KERNEL;
			else if (strncmp(pm_name, "userspace", MPTCP_PM_NAME_MAX) == 0)
				pm_type = MPTCP_PM_TYPE_USERSPACE;
			pernet->pm_type = pm_type;
		}
	}
	return ret;
}
/* sysctl handler for the deprecated net.mptcp.pm_type (INTEGER).
 *
 * After a successful write, mirrors the numeric type into the new
 * path_manager string knob so the two knobs stay consistent.
 */
static int proc_pm_type(const struct ctl_table *ctl, int write,
			void *buffer, size_t *lenp, loff_t *ppos)
{
	struct mptcp_pernet *pernet = container_of(ctl->data,
						   struct mptcp_pernet,
						   pm_type);
	int ret;
	ret = proc_dou8vec_minmax(ctl, write, buffer, lenp, ppos);
	if (write && ret == 0) {
		/* Re-read the value just stored by proc_dou8vec_minmax();
		 * NOTE(review): the table's extra1/extra2 bounds presumably
		 * make the "" fallback unreachable -- confirm against
		 * mptcp_sysctl_table.
		 */
		u8 pm_type = READ_ONCE(*(u8 *)ctl->data);
		char *pm_name = "";
		if (pm_type == MPTCP_PM_TYPE_KERNEL)
			pm_name = "kernel";
		else if (pm_type == MPTCP_PM_TYPE_USERSPACE)
			pm_name = "userspace";
		/* Return value ignored: the numeric write already succeeded;
		 * built-in PM names are expected to be registered.
		 */
		mptcp_set_path_manager(pernet->path_manager, pm_name);
	}
	return ret;
}
/* sysctl handler for net.mptcp.available_path_managers (read-only STRING):
 * reports the names of all currently registered path managers.
 */
static int proc_available_path_managers(const struct ctl_table *ctl,
					int write, void *buffer,
					size_t *lenp, loff_t *ppos)
{
	struct ctl_table tbl = { .maxlen = MPTCP_PM_BUF_MAX, };
	int ret;
	/* Build the list in a temporary buffer sized for the worst case
	 * (MPTCP_PM_MAX names of MPTCP_PM_NAME_MAX bytes each).
	 */
	tbl.data = kmalloc(tbl.maxlen, GFP_USER);
	if (!tbl.data)
		return -ENOMEM;
	mptcp_pm_get_available(tbl.data, MPTCP_PM_BUF_MAX);
	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	kfree(tbl.data);
	return ret;
}
static struct ctl_table mptcp_sysctl_table[] = {
{
.procname = "enabled",
@@ -218,7 +315,7 @@ static struct ctl_table mptcp_sysctl_table[] = {
.procname = "pm_type",
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.proc_handler = proc_pm_type,
.extra1 = SYSCTL_ZERO,
.extra2 = &mptcp_pm_type_max
},
@@ -253,6 +350,18 @@ static struct ctl_table mptcp_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
},
{
.procname = "path_manager",
.maxlen = MPTCP_PM_NAME_MAX,
.mode = 0644,
.proc_handler = proc_path_manager,
},
{
.procname = "available_path_managers",
.maxlen = MPTCP_PM_BUF_MAX,
.mode = 0444,
.proc_handler = proc_available_path_managers,
},
};
static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
@@ -278,6 +387,8 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[8].data = &pernet->close_timeout;
table[9].data = &pernet->blackhole_timeout;
table[10].data = &pernet->syn_retrans_before_tcp_fallback;
table[11].data = &pernet->path_manager;
/* table[12] is for available_path_managers which is read-only info */
hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
ARRAY_SIZE(mptcp_sysctl_table));

View File

@@ -5,6 +5,8 @@
*/
#define pr_fmt(fmt) "MPTCP: " fmt
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include "protocol.h"
#include "mib.h"
@@ -18,6 +20,9 @@ struct mptcp_pm_add_entry {
struct mptcp_sock *sock;
};
static DEFINE_SPINLOCK(mptcp_pm_list_lock);
static LIST_HEAD(mptcp_pm_list);
/* path manager helpers */
/* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses,
@@ -511,13 +516,13 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)
* be sure to serve this event only once.
*/
if (READ_ONCE(pm->work_pending) &&
!(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
!(pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);
if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0)
if ((pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0)
announce = true;
msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
pm->status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
spin_unlock_bh(&pm->lock);
if (announce)
@@ -978,10 +983,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk)
u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk));
struct mptcp_pm_data *pm = &msk->pm;
pm->add_addr_signaled = 0;
pm->add_addr_accepted = 0;
pm->local_addr_used = 0;
pm->subflows = 0;
memset(&pm->reset, 0, sizeof(pm->reset));
pm->rm_list_tx.nr = 0;
pm->rm_list_rx.nr = 0;
WRITE_ONCE(pm->pm_type, pm_type);
@@ -1000,16 +1002,9 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk)
!!mptcp_pm_get_add_addr_accept_max(msk) &&
subflows_allowed);
WRITE_ONCE(pm->accept_subflow, subflows_allowed);
} else {
WRITE_ONCE(pm->work_pending, 0);
WRITE_ONCE(pm->accept_addr, 0);
WRITE_ONCE(pm->accept_subflow, 0);
}
WRITE_ONCE(pm->addr_signal, 0);
WRITE_ONCE(pm->remote_deny_join_id0, false);
pm->status = 0;
bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
bitmap_fill(pm->id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
}
}
void mptcp_pm_data_init(struct mptcp_sock *msk)
@@ -1022,5 +1017,75 @@ void mptcp_pm_data_init(struct mptcp_sock *msk)
void __init mptcp_pm_init(void)
{
mptcp_pm_kernel_register();
mptcp_pm_userspace_register();
mptcp_pm_nl_init();
}
/* Must be called with rcu read lock held */
struct mptcp_pm_ops *mptcp_pm_find(const char *name)
{
struct mptcp_pm_ops *pm_ops;
list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) {
if (!strcmp(pm_ops->name, name))
return pm_ops;
}
return NULL;
}
/* Sanity-check a path manager before registration. Currently accepts
 * everything; kept as an extension point for future mandatory checks.
 */
int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops)
{
	return 0;
}
/* Register a new path manager. Returns 0 on success, an error from
 * mptcp_pm_validate(), or -EEXIST if a PM with the same name is already
 * registered. Writers serialize on mptcp_pm_list_lock; readers walk the
 * list under RCU.
 */
int mptcp_pm_register(struct mptcp_pm_ops *pm_ops)
{
	int ret;
	ret = mptcp_pm_validate(pm_ops);
	if (ret)
		return ret;
	spin_lock(&mptcp_pm_list_lock);
	/* Names must be unique: the check runs under the lock to avoid a
	 * race with a concurrent registration of the same name.
	 */
	if (mptcp_pm_find(pm_ops->name)) {
		spin_unlock(&mptcp_pm_list_lock);
		return -EEXIST;
	}
	list_add_tail_rcu(&pm_ops->list, &mptcp_pm_list);
	spin_unlock(&mptcp_pm_list_lock);
	pr_debug("%s registered\n", pm_ops->name);
	return 0;
}
/* Remove a path manager from the registered list. The built-in "kernel"
 * PM is the default and must never be unregistered (warns and bails).
 */
void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops)
{
	/* skip unregistering the default path manager */
	if (WARN_ON_ONCE(pm_ops == &mptcp_pm_kernel))
		return;
	spin_lock(&mptcp_pm_list_lock);
	list_del_rcu(&pm_ops->list);
	spin_unlock(&mptcp_pm_list_lock);
}
/* Build string with list of available path manager values.
 * Similar to tcp_get_available_congestion_control()
 */
void mptcp_pm_get_available(char *buf, size_t maxlen)
{
	struct mptcp_pm_ops *pm_ops;
	size_t offs = 0;
	rcu_read_lock();
	list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) {
		/* Space-separate entries; offs tracks the running length so
		 * each snprintf() appends after the previous name.
		 */
		offs += snprintf(buf + offs, maxlen - offs, "%s%s",
				 offs == 0 ? "" : " ", pm_ops->name);
		/* snprintf() returns the would-be length, so offs >= maxlen
		 * means the output was truncated: stop before the next
		 * iteration would pass a negative remaining size.
		 */
		if (WARN_ON_ONCE(offs >= maxlen))
			break;
	}
	rcu_read_unlock();
}

View File

@@ -710,11 +710,10 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk,
return ret;
/* address not found, add to local list */
entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
entry = kmemdup(skc, sizeof(*skc), GFP_ATOMIC);
if (!entry)
return -ENOMEM;
*entry = *skc;
entry->addr.port = 0;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false);
if (ret < 0)
@@ -817,13 +816,12 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT);
entry = kmemdup(&addr, sizeof(addr), GFP_KERNEL_ACCOUNT);
if (!entry) {
GENL_SET_ERR_MSG(info, "can't allocate addr");
return -ENOMEM;
}
*entry = addr;
if (entry->addr.port) {
ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry);
if (ret) {
@@ -1400,11 +1398,15 @@ static struct pernet_operations mptcp_pm_pernet_ops = {
.size = sizeof(struct pm_nl_pernet),
};
void __init mptcp_pm_nl_init(void)
/* ops for the built-in in-kernel path manager */
struct mptcp_pm_ops mptcp_pm_kernel = {
	.name = "kernel",
	.owner = THIS_MODULE,
};
/* Boot-time registration of the in-kernel PM and its pernet state */
void __init mptcp_pm_kernel_register(void)
{
	if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0)
		panic("Failed to register MPTCP PM pernet subsystem.\n");
	/* NOTE(review): genl family registration also appears in
	 * mptcp_pm_nl_init() -- confirm only one call site actually
	 * registers mptcp_genl_family (this view may be a mangled diff).
	 */
	if (genl_register_family(&mptcp_genl_family))
		panic("Failed to register MPTCP PM netlink family\n");
	mptcp_pm_register(&mptcp_pm_kernel);
}

View File

@@ -625,3 +625,9 @@ struct genl_family mptcp_genl_family __ro_after_init = {
.mcgrps = mptcp_pm_mcgrps,
.n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps),
};
/* Register the MPTCP PM generic netlink family, shared by all path
 * managers. Failure at boot is fatal.
 */
void __init mptcp_pm_nl_init(void)
{
	if (genl_register_family(&mptcp_genl_family))
		panic("Failed to register MPTCP PM netlink family\n");
}

View File

@@ -682,3 +682,13 @@ int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr,
sock_put(sk);
return ret;
}
/* ops for the built-in userspace path manager */
static struct mptcp_pm_ops mptcp_pm_userspace = {
	.name = "userspace",
	.owner = THIS_MODULE,
};
/* Boot-time registration making the "userspace" PM selectable via sysctl */
void __init mptcp_pm_userspace_register(void)
{
	mptcp_pm_register(&mptcp_pm_userspace);
}

View File

@@ -223,6 +223,8 @@ struct mptcp_pm_data {
spinlock_t lock; /*protects the whole PM data */
struct_group(reset,
u8 addr_signal;
bool server_side;
bool work_pending;
@@ -235,6 +237,9 @@ struct mptcp_pm_data {
u8 pm_type;
u8 subflows;
u8 status;
);
DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
struct mptcp_rm_list rm_list_tx;
struct mptcp_rm_list rm_list_rx;
@@ -694,6 +699,7 @@ int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
unsigned int mptcp_close_timeout(const struct sock *sk);
int mptcp_get_pm_type(const struct net *net);
const char *mptcp_get_path_manager(const struct net *net);
const char *mptcp_get_scheduler(const struct net *net);
void mptcp_active_disable(struct sock *sk);
@@ -1045,6 +1051,15 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_
void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk,
struct mptcp_pm_addr_entry *entry);
/* the default path manager, used in mptcp_pm_unregister */
extern struct mptcp_pm_ops mptcp_pm_kernel;
struct mptcp_pm_ops *mptcp_pm_find(const char *name);
int mptcp_pm_register(struct mptcp_pm_ops *pm_ops);
void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops);
int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops);
void mptcp_pm_get_available(char *buf, size_t maxlen);
void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk);
void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk,
@@ -1147,6 +1162,8 @@ static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflo
return local_id;
}
void __init mptcp_pm_kernel_register(void);
void __init mptcp_pm_userspace_register(void);
void __init mptcp_pm_nl_init(void);
void mptcp_pm_worker(struct mptcp_sock *msk);
void __mptcp_pm_kernel_worker(struct mptcp_sock *msk);

View File

@@ -117,7 +117,36 @@ cleanup()
trap cleanup EXIT
# Create and configure network namespaces for testing
print_title "Init"
mptcp_lib_ns_init ns1 ns2
# check path_manager and pm_type sysctl mapping
# Only run when the kernel exposes the new knob (older kernels lack it).
if [ -f /proc/sys/net/mptcp/path_manager ]; then
# Writing a valid name must be mirrored into the legacy pm_type knob.
ip netns exec "$ns1" sysctl -q net.mptcp.path_manager=userspace
pm_type="$(ip netns exec "$ns1" sysctl -n net.mptcp.pm_type)"
if [ "${pm_type}" != "1" ]; then
test_fail "unexpected pm_type: ${pm_type}"
mptcp_lib_result_print_all_tap
exit ${KSFT_FAIL}
fi
# Writing an unknown name must be rejected and leave pm_type untouched.
ip netns exec "$ns1" sysctl -q net.mptcp.path_manager=error 2>/dev/null
pm_type="$(ip netns exec "$ns1" sysctl -n net.mptcp.pm_type)"
if [ "${pm_type}" != "1" ]; then
test_fail "unexpected pm_type after error: ${pm_type}"
mptcp_lib_result_print_all_tap
exit ${KSFT_FAIL}
fi
# Writing the legacy pm_type knob must be mirrored into path_manager.
ip netns exec "$ns1" sysctl -q net.mptcp.pm_type=0
pm_name="$(ip netns exec "$ns1" sysctl -n net.mptcp.path_manager)"
if [ "${pm_name}" != "kernel" ]; then
test_fail "unexpected path-manager: ${pm_name}"
mptcp_lib_result_print_all_tap
exit ${KSFT_FAIL}
fi
fi
# Both namespaces use the userspace PM for the rest of this selftest.
for i in "$ns1" "$ns2" ;do
ip netns exec "$i" sysctl -q net.mptcp.pm_type=1
done
@@ -152,7 +181,6 @@ mptcp_lib_events "${ns1}" "${server_evts}" server_evts_pid
sleep 0.5
mptcp_lib_subtests_last_ts_reset
print_title "Init"
print_test "Created network namespaces ns1, ns2"
test_pass