
net/mlx5: Manage TC arbiter nodes and implement full support for tc-bw

Introduce support for managing Traffic Class (TC) arbiter nodes and
associated vports TC nodes within the E-Switch QoS hierarchy. This
patch adds support for the new scheduling node type,
`SCHED_NODE_TYPE_VPORTS_TC_TSAR`, and implements full support for
setting tc-bw on both vports and nodes.

Key changes include:

- Introduced the new scheduling node type,
  `SCHED_NODE_TYPE_VPORTS_TC_TSAR`, for managing vports within the TC
  arbiter node.

- Added new helper functions for creating and destroying vports TC
  nodes under the TC arbiter.

- Updated the minimum rate normalization function to skip nodes of type
  `SCHED_NODE_TYPE_VPORTS_TC_TSAR`. Vports TC TSARs have bandwidth
  shares configured on them but not minimum rates, so their `min_rate`
  cannot be normalized.

- Implemented `esw_qos_tc_arbiter_scheduling_setup()` and
  `esw_qos_tc_arbiter_scheduling_teardown()` for initializing and
  cleaning up TC arbiter scheduling elements. These functions now fully
  support tc-bw configuration on TC arbiter nodes.

- Introduced a new helper `esw_qos_calculate_tc_bw_divider()` to
  compute the total TC bandwidth share, which is used as a divider for
  normalizing each TC's share.

- Added `esw_qos_tc_arbiter_get_bw_shares()` and
  `esw_qos_set_tc_arbiter_bw_shares()` to handle the setting of
  bandwidth shares for vports traffic class TSARs.

- `esw_qos_set_tc_arbiter_bw_shares()` normalizes each TC share based
  on the total and the firmware's maximum allowed TSAR bandwidth share
  (a worked sketch of this math follows the list).

- Refactored `mlx5_esw_devlink_rate_node_tc_bw_set()` and
  `mlx5_esw_devlink_rate_leaf_tc_bw_set()` to fully support configuring
  tc-bw on devlink rate nodes and vports, respectively (an example
  devlink invocation also follows the list).

- Refactored `mlx5_esw_qos_node_update_parent()` to ensure that tc-bw
  configuration remains compatible with setting a parent on a rate
  node, preserving level hierarchy functionality.

- Refactored `esw_qos_calc_bw_share()` to generalize its input so it
  can be used for both minimum rate and bandwidth share calculations.
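
To make the normalization concrete, here is a minimal standalone C
sketch of the math, not the driver code itself. `TCS_MAX`,
`MIN_BW_SHARE` and `fw_max = 100` are illustrative assumptions; the
driver takes the maximum from the `max_tsar_bw_share` device capability
and uses its own `MLX5_MIN_BW_SHARE` definition.

#include <stdio.h>

#define TCS_MAX            8   /* assumed number of traffic classes */
#define MIN_BW_SHARE       1   /* assumed firmware minimum share */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Same shape as esw_qos_calc_bw_share(): scale the value against the
 * divider, then clamp the result to [MIN_BW_SHARE, fw_max].
 */
static unsigned int calc_bw_share(unsigned int value, unsigned int divider,
                                  unsigned int fw_max)
{
        unsigned int share;

        if (!divider)
                return 0;
        share = DIV_ROUND_UP(value, divider);
        if (share < MIN_BW_SHARE)
                share = MIN_BW_SHARE;
        return share > fw_max ? fw_max : share;
}

int main(void)
{
        /* Hypothetical tc-bw request: TC0 gets 20%, TC5 gets 80%. */
        unsigned int tc_bw[TCS_MAX] = { 20, 0, 0, 0, 0, 80, 0, 0 };
        unsigned int fw_max = 100, divider = 0, i;

        /* The divider is the sum of all TC shares, as in the new
         * esw_qos_calculate_tc_bw_divider() helper.
         */
        for (i = 0; i < TCS_MAX; i++)
                divider += tc_bw[i];

        for (i = 0; i < TCS_MAX; i++)
                printf("tc%u -> bw_share %u\n", i,
                       calc_bw_share(tc_bw[i] * fw_max, divider, fw_max));
        return 0;
}

With these inputs the sketch prints a bw_share of 20 for TC0, 80 for
TC5, and 1 (the clamped minimum) for the unused TCs, mirroring how
zero-share traffic classes still receive MLX5_MIN_BW_SHARE in the
driver.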

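For context, the tc-bw values handled by these setters arrive through
the devlink rate API. A hypothetical invocation, assuming an existing
rate node named `vfs_group` on `pci/0000:08:00.0` and the tc-bw syntax
of current iproute2 (exact spelling may vary by version), could look
like:

$ devlink port function rate set pci/0000:08:00.0/vfs_group \
        tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0

This requests 20% of the bandwidth for traffic class 0 and 80% for
traffic class 5, which the driver then normalizes into TSAR bw_share
values as sketched above.
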
Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250629142138.361537-8-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
commit cf7e73770d (parent 97733d1e00)
Authored by Carolina Jubran on 2025-06-29 17:21:37 +03:00; committed by Jakub Kicinski


@@ -67,6 +67,7 @@ enum sched_node_type {
 	SCHED_NODE_TYPE_TC_ARBITER_TSAR,
 	SCHED_NODE_TYPE_RATE_LIMITER,
 	SCHED_NODE_TYPE_VPORT_TC,
+	SCHED_NODE_TYPE_VPORTS_TC_TSAR,
 };
 
 static const char * const sched_node_type_str[] = {
@@ -75,6 +76,7 @@ static const char * const sched_node_type_str[] = {
 	[SCHED_NODE_TYPE_TC_ARBITER_TSAR] = "TC Arbiter TSAR",
 	[SCHED_NODE_TYPE_RATE_LIMITER] = "Rate Limiter",
 	[SCHED_NODE_TYPE_VPORT_TC] = "vport TC",
+	[SCHED_NODE_TYPE_VPORTS_TC_TSAR] = "vports TC TSAR",
 };
 
 struct mlx5_esw_sched_node {
@@ -187,6 +189,11 @@ mlx5_esw_qos_vport_get_parent(const struct mlx5_vport *vport)
 static void esw_qos_sched_elem_warn(struct mlx5_esw_sched_node *node, int err, const char *op)
 {
 	switch (node->type) {
+	case SCHED_NODE_TYPE_VPORTS_TC_TSAR:
+		esw_warn(node->esw->dev,
+			 "E-Switch %s %s scheduling element failed (tc=%d,err=%d)\n",
+			 op, sched_node_type_str[node->type], node->tc, err);
+		break;
 	case SCHED_NODE_TYPE_VPORT_TC:
 		esw_warn(node->esw->dev,
 			 "E-Switch %s %s scheduling element failed (vport=%d,tc=%d,err=%d)\n",
@@ -345,11 +352,13 @@ static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw,
 	return 0;
 }
 
-static u32 esw_qos_calc_bw_share(u32 min_rate, u32 divider, u32 fw_max)
+static u32 esw_qos_calc_bw_share(u32 value, u32 divider, u32 fw_max)
 {
 	if (!divider)
 		return 0;
-	return min_t(u32, max_t(u32, DIV_ROUND_UP(min_rate, divider), MLX5_MIN_BW_SHARE), fw_max);
+	return min_t(u32, fw_max,
+		     max_t(u32,
+			   DIV_ROUND_UP(value, divider), MLX5_MIN_BW_SHARE));
 }
 
 static void esw_qos_update_sched_node_bw_share(struct mlx5_esw_sched_node *node,
@@ -376,7 +385,13 @@ static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw,
 		if (node->esw != esw || node->ix == esw->qos.root_tsar_ix)
 			continue;
 
-		esw_qos_update_sched_node_bw_share(node, divider, extack);
+		/* Vports TC TSARs don't have a minimum rate configured,
+		 * so there's no need to update the bw_share on them.
+		 */
+		if (node->type != SCHED_NODE_TYPE_VPORTS_TC_TSAR) {
+			esw_qos_update_sched_node_bw_share(node, divider,
+							   extack);
+		}
 
 		if (list_empty(&node->children))
 			continue;
@@ -385,6 +400,20 @@
 	}
 }
 
+static u32 esw_qos_calculate_tc_bw_divider(u32 *tc_bw)
+{
+	u32 total = 0;
+	int i;
+
+	for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++)
+		total += tc_bw[i];
+
+	/* If total is zero, tc-bw config is disabled and we shouldn't reach
+	 * here.
+	 */
+	return WARN_ON(!total) ? 1 : total;
+}
+
 static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node,
 				     u32 min_rate, struct netlink_ext_ack *extack)
 {
@@ -527,6 +556,149 @@ static void esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netlin
 	__esw_qos_free_node(node);
 }
 
+static int esw_qos_create_vports_tc_node(struct mlx5_esw_sched_node *parent,
+					 u8 tc, struct netlink_ext_ack *extack)
+{
+	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
+	struct mlx5_core_dev *dev = parent->esw->dev;
+	struct mlx5_esw_sched_node *vports_tc_node;
+	void *attr;
+	int err;
+
+	if (!mlx5_qos_element_type_supported(
+		dev,
+		SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR,
+		SCHEDULING_HIERARCHY_E_SWITCH) ||
+	    !mlx5_qos_tsar_type_supported(dev,
+					  TSAR_ELEMENT_TSAR_TYPE_DWRR,
+					  SCHEDULING_HIERARCHY_E_SWITCH))
+		return -EOPNOTSUPP;
+
+	vports_tc_node = __esw_qos_alloc_node(parent->esw, 0,
+					      SCHED_NODE_TYPE_VPORTS_TC_TSAR,
+					      parent);
+	if (!vports_tc_node) {
+		NL_SET_ERR_MSG_MOD(extack, "E-Switch alloc node failed");
+		esw_warn(dev, "Failed to alloc vports TC node (tc=%d)\n", tc);
+		return -ENOMEM;
+	}
+
+	attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes);
+	MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_DWRR);
+	MLX5_SET(tsar_element, attr, traffic_class, tc);
+	MLX5_SET(scheduling_context, tsar_ctx, parent_element_id, parent->ix);
+	MLX5_SET(scheduling_context, tsar_ctx, element_type,
+		 SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);
+
+	err = esw_qos_node_create_sched_element(vports_tc_node, tsar_ctx,
+						extack);
+	if (err)
+		goto err_create_sched_element;
+
+	vports_tc_node->tc = tc;
+
+	return 0;
+
+err_create_sched_element:
+	__esw_qos_free_node(vports_tc_node);
+	return err;
+}
+
+static void
+esw_qos_tc_arbiter_get_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node,
+				 u32 *tc_bw)
+{
+	struct mlx5_esw_sched_node *vports_tc_node;
+
+	list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry)
+		tc_bw[vports_tc_node->tc] = vports_tc_node->bw_share;
+}
+
+static void
+esw_qos_set_tc_arbiter_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node,
+				 u32 *tc_bw, struct netlink_ext_ack *extack)
+{
+	struct mlx5_eswitch *esw = tc_arbiter_node->esw;
+	struct mlx5_esw_sched_node *vports_tc_node;
+	u32 divider, fw_max_bw_share;
+
+	fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
+	divider = esw_qos_calculate_tc_bw_divider(tc_bw);
+	list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) {
+		u8 tc = vports_tc_node->tc;
+		u32 bw_share;
+
+		bw_share = tc_bw[tc] * fw_max_bw_share;
+		bw_share = esw_qos_calc_bw_share(bw_share, divider,
+						 fw_max_bw_share);
+		esw_qos_sched_elem_config(vports_tc_node, 0, bw_share, extack);
+	}
+}
+
+static void
+esw_qos_destroy_vports_tc_nodes(struct mlx5_esw_sched_node *tc_arbiter_node,
+				struct netlink_ext_ack *extack)
+{
+	struct mlx5_esw_sched_node *vports_tc_node, *tmp;
+
+	list_for_each_entry_safe(vports_tc_node, tmp,
+				 &tc_arbiter_node->children, entry)
+		esw_qos_destroy_node(vports_tc_node, extack);
+}
+
+static int
+esw_qos_create_vports_tc_nodes(struct mlx5_esw_sched_node *tc_arbiter_node,
+			       struct netlink_ext_ack *extack)
+{
+	struct mlx5_eswitch *esw = tc_arbiter_node->esw;
+	int err, i, num_tcs = esw_qos_num_tcs(esw->dev);
+
+	for (i = 0; i < num_tcs; i++) {
+		err = esw_qos_create_vports_tc_node(tc_arbiter_node, i, extack);
+		if (err)
+			goto err_tc_node_create;
+	}
+
+	return 0;
+
+err_tc_node_create:
+	esw_qos_destroy_vports_tc_nodes(tc_arbiter_node, NULL);
+	return err;
+}
+
+static int esw_qos_create_tc_arbiter_sched_elem(
+		struct mlx5_esw_sched_node *tc_arbiter_node,
+		struct netlink_ext_ack *extack)
+{
+	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
+	u32 tsar_parent_ix;
+	void *attr;
+
+	if (!mlx5_qos_tsar_type_supported(tc_arbiter_node->esw->dev,
+					  TSAR_ELEMENT_TSAR_TYPE_TC_ARB,
+					  SCHEDULING_HIERARCHY_E_SWITCH)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "E-Switch TC Arbiter scheduling element is not supported");
+		return -EOPNOTSUPP;
+	}
+
+	attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes);
+	MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_TC_ARB);
+	tsar_parent_ix = tc_arbiter_node->parent ? tc_arbiter_node->parent->ix :
+			 tc_arbiter_node->esw->qos.root_tsar_ix;
+	MLX5_SET(scheduling_context, tsar_ctx, parent_element_id,
+		 tsar_parent_ix);
+	MLX5_SET(scheduling_context, tsar_ctx, element_type,
+		 SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);
+	MLX5_SET(scheduling_context, tsar_ctx, max_average_bw,
+		 tc_arbiter_node->max_rate);
+	MLX5_SET(scheduling_context, tsar_ctx, bw_share,
+		 tc_arbiter_node->bw_share);
+
+	return esw_qos_node_create_sched_element(tc_arbiter_node, tsar_ctx,
+						 extack);
+}
+
 static struct mlx5_esw_sched_node *
 __esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sched_node *parent,
 				   struct netlink_ext_ack *extack)
@@ -591,6 +763,9 @@ static void __esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netl
 {
 	struct mlx5_eswitch *esw = node->esw;
 
+	if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR)
+		esw_qos_destroy_vports_tc_nodes(node, extack);
+
 	trace_mlx5_esw_node_qos_destroy(esw->dev, node, node->ix);
 	esw_qos_destroy_node(node, extack);
 	esw_qos_normalize_min_rate(esw, NULL, extack);
@@ -685,13 +860,38 @@ static void esw_qos_put(struct mlx5_eswitch *esw)
 static void
 esw_qos_tc_arbiter_scheduling_teardown(struct mlx5_esw_sched_node *node,
 				       struct netlink_ext_ack *extack)
-{}
+{
+	/* Clean up all Vports TC nodes within the TC arbiter node. */
+	esw_qos_destroy_vports_tc_nodes(node, extack);
+	/* Destroy the scheduling element for the TC arbiter node itself. */
+	esw_qos_node_destroy_sched_element(node, extack);
+}
 
 static int esw_qos_tc_arbiter_scheduling_setup(struct mlx5_esw_sched_node *node,
 					       struct netlink_ext_ack *extack)
 {
-	NL_SET_ERR_MSG_MOD(extack, "TC arbiter elements are not supported.");
-	return -EOPNOTSUPP;
+	u32 curr_ix = node->ix;
+	int err;
+
+	err = esw_qos_create_tc_arbiter_sched_elem(node, extack);
+	if (err)
+		return err;
+	/* Initialize the vports TC nodes within created TC arbiter TSAR. */
+	err = esw_qos_create_vports_tc_nodes(node, extack);
+	if (err)
+		goto err_vports_tc_nodes;
+
+	node->type = SCHED_NODE_TYPE_TC_ARBITER_TSAR;
+
+	return 0;
+
+err_vports_tc_nodes:
+	/* If initialization fails, clean up the scheduling element
+	 * for the TC arbiter node.
+	 */
+	esw_qos_node_destroy_sched_element(node, NULL);
+	node->ix = curr_ix;
+	return err;
+}
 
 static int
@@ -1064,6 +1264,7 @@ static int esw_qos_vport_update(struct mlx5_vport *vport,
 {
 	struct mlx5_esw_sched_node *curr_parent = vport->qos.sched_node->parent;
 	enum sched_node_type curr_type = vport->qos.sched_node->type;
+	u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0};
 	int err;
 
 	esw_assert_qos_lock_held(vport->dev->priv.eswitch);
@@ -1075,11 +1276,23 @@
 	if (err)
 		return err;
 
+	if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) {
+		esw_qos_tc_arbiter_get_bw_shares(vport->qos.sched_node,
+						 curr_tc_bw);
+	}
+
 	esw_qos_vport_disable(vport, extack);
 	err = esw_qos_vport_enable(vport, type, parent, extack);
-	if (err)
+	if (err) {
 		esw_qos_vport_enable(vport, curr_type, curr_parent, NULL);
+		extack = NULL;
+	}
 
+	if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) {
+		esw_qos_set_tc_arbiter_bw_shares(vport->qos.sched_node,
+						 curr_tc_bw, extack);
+	}
+
 	return err;
 }
@@ -1563,6 +1776,8 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf,
 					   SCHED_NODE_TYPE_TC_ARBITER_TSAR,
 					   NULL, extack);
 	}
+	if (!err)
+		esw_qos_set_tc_arbiter_bw_shares(vport_node, tc_bw, extack);
 unlock:
 	esw_qos_unlock(esw);
 	return err;
@@ -1592,6 +1807,8 @@ int mlx5_esw_devlink_rate_node_tc_bw_set(struct devlink_rate *rate_node,
 	}
 
 	err = esw_qos_node_enable_tc_arbitration(node, extack);
+	if (!err)
+		esw_qos_set_tc_arbiter_bw_shares(node, tc_bw, extack);
 unlock:
 	esw_qos_unlock(esw);
 	return err;
@@ -1716,6 +1933,20 @@ int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate,
 	return mlx5_esw_qos_vport_update_parent(vport, node, extack);
 }
 
+static bool esw_qos_is_node_empty(struct mlx5_esw_sched_node *node)
+{
+	if (list_empty(&node->children))
+		return true;
+
+	if (node->type != SCHED_NODE_TYPE_TC_ARBITER_TSAR)
+		return false;
+
+	node = list_first_entry(&node->children, struct mlx5_esw_sched_node,
+				entry);
+
+	return esw_qos_is_node_empty(node);
+}
+
 static int
 mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node,
 				      struct mlx5_esw_sched_node *parent,
@@ -1729,13 +1960,26 @@ mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node,
 		return -EOPNOTSUPP;
 	}
 
-	if (!list_empty(&node->children)) {
+	if (!esw_qos_is_node_empty(node)) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Cannot reassign a node that contains rate objects");
 		return -EOPNOTSUPP;
 	}
 
+	if (parent && parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Cannot attach a node to a parent with TC bandwidth configured");
+		return -EOPNOTSUPP;
+	}
+
 	new_level = parent ? parent->level + 1 : 2;
+	if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
+		/* Increase by one to account for the vports TC scheduling
+		 * element.
+		 */
+		new_level += 1;
+	}
+
 	max_level = 1 << MLX5_CAP_QOS(node->esw->dev, log_esw_max_sched_depth);
 	if (new_level > max_level) {
 		NL_SET_ERR_MSG_MOD(extack,
@@ -1746,6 +1990,32 @@ mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node,
 	return 0;
 }
 
+static int
+esw_qos_tc_arbiter_node_update_parent(struct mlx5_esw_sched_node *node,
+				      struct mlx5_esw_sched_node *parent,
+				      struct netlink_ext_ack *extack)
+{
+	struct mlx5_esw_sched_node *curr_parent = node->parent;
+	u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0};
+	struct mlx5_eswitch *esw = node->esw;
+	int err;
+
+	esw_qos_tc_arbiter_get_bw_shares(node, curr_tc_bw);
+	esw_qos_tc_arbiter_scheduling_teardown(node, extack);
+	esw_qos_node_set_parent(node, parent);
+	err = esw_qos_tc_arbiter_scheduling_setup(node, extack);
+	if (err) {
+		esw_qos_node_set_parent(node, curr_parent);
+		if (esw_qos_tc_arbiter_scheduling_setup(node, extack)) {
+			esw_warn(esw->dev, "Node restore QoS failed\n");
+			return err;
+		}
+	}
+	esw_qos_set_tc_arbiter_bw_shares(node, curr_tc_bw, extack);
+
+	return err;
+}
+
 static int esw_qos_vports_node_update_parent(struct mlx5_esw_sched_node *node,
 					     struct mlx5_esw_sched_node *parent,
 					     struct netlink_ext_ack *extack)
@@ -1791,7 +2061,13 @@ static int mlx5_esw_qos_node_update_parent(struct mlx5_esw_sched_node *node,
 	esw_qos_lock(esw);
 	curr_parent = node->parent;
-	err = esw_qos_vports_node_update_parent(node, parent, extack);
+	if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
+		err = esw_qos_tc_arbiter_node_update_parent(node, parent,
+							    extack);
+	} else {
+		err = esw_qos_vports_node_update_parent(node, parent, extack);
+	}
+
 	if (err)
 		goto out;