Merge branch 'pci/bwctrl'

- Add read/modify/write locking for Link Control 2, which is used to manage
  Link speed (Ilpo Järvinen)

- Cache all supported Link speeds for use by the PCIe bandwidth controller
  (Ilpo Järvinen)

- Extract the Link Bandwidth Management Status check into pcie_lbms_seen(),
  where it can be shared between the bandwidth controller and quirks that
  use it to help retrain failed links (Ilpo Järvinen)

- Re-add Link Bandwidth notification support with updates to address the
  reasons it was previously reverted (Alexandru Gagniuc, Ilpo Järvinen)

- Add pcie_set_target_speed() and related functionality to manage PCIe Link
  speed based on thermal constraints (Ilpo Järvinen)

- Add a thermal cooling driver to throttle PCIe Links via the existing
  thermal management framework (Ilpo Järvinen)

- Add a userspace selftest for the PCIe bandwidth controller (Ilpo
  Järvinen)

- Drop duplicate pcie_get_speed_cap(), pcie_get_width_cap() declarations
  (Bjorn Helgaas)

* pci/bwctrl:
  PCI: Drop duplicate pcie_get_speed_cap(), pcie_get_width_cap() declarations
  selftests/pcie_bwctrl: Create selftests
  thermal: Add PCIe cooling driver
  PCI/bwctrl: Add pcie_set_target_speed() to set PCIe Link Speed
  PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller
  PCI: Abstract LBMS seen check into pcie_lbms_seen()
  PCI: Refactor pcie_update_link_speed()
  PCI: Store all PCIe Supported Link Speeds
  PCI: Protect Link Control 2 Register with RMW locking
  Documentation PCI: Reformat RMW ops documentation
This commit is contained in:
Bjorn Helgaas
2024-11-25 13:40:43 -06:00
22 changed files with 852 additions and 64 deletions

View File

@@ -217,8 +217,12 @@ capability structure except the PCI Express capability structure,
that is shared between many drivers including the service drivers.
RMW Capability accessors (pcie_capability_clear_and_set_word(),
pcie_capability_set_word(), and pcie_capability_clear_word()) protect
a selected set of PCI Express Capability Registers (Link Control
Register and Root Control Register). Any change to those registers
should be performed using RMW accessors to avoid problems due to
concurrent updates. For the up-to-date list of protected registers,
see pcie_capability_clear_and_set_word().
a selected set of PCI Express Capability Registers:
* Link Control Register
* Root Control Register
* Link Control 2 Register
Any change to those registers should be performed using RMW accessors to
avoid problems due to concurrent updates. For the up-to-date list of
protected registers, see pcie_capability_clear_and_set_word().

View File

@@ -17933,6 +17933,15 @@ F: include/linux/of_pci.h
F: include/linux/pci*
F: include/uapi/linux/pci*
PCIE BANDWIDTH CONTROLLER
M: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
L: linux-pci@vger.kernel.org
S: Supported
F: drivers/pci/pcie/bwctrl.c
F: drivers/thermal/pcie_cooling.c
F: include/linux/pci-bwctrl.h
F: tools/testing/selftests/pcie_bwctrl/
PCIE DRIVER FOR AMAZON ANNAPURNA LABS
M: Jonathan Chocron <jonnyc@amazon.com>
L: linux-pci@vger.kernel.org

View File

@@ -19,6 +19,8 @@
#include <linux/types.h>
#include <linux/pm_runtime.h>
#include <linux/pci.h>
#include "../pci.h"
#include "pciehp.h"
/* The following routines constitute the bulk of the
@@ -127,6 +129,9 @@ static void remove_board(struct controller *ctrl, bool safe_removal)
pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
INDICATOR_NOOP);
/* Don't carry LBMS indications across */
pcie_reset_lbms_count(ctrl->pcie->port);
}
static int pciehp_enable_slot(struct controller *ctrl);

View File

@@ -319,7 +319,7 @@ int pciehp_check_link_status(struct controller *ctrl)
return -1;
}
pcie_update_link_speed(ctrl->pcie->port->subordinate, lnk_status);
__pcie_update_link_speed(ctrl->pcie->port->subordinate, lnk_status);
if (!found) {
ctrl_info(ctrl, "Slot(%s): No device found\n",

View File

@@ -4740,7 +4740,7 @@ int pcie_retrain_link(struct pci_dev *pdev, bool use_lt)
* to track link speed or width changes made by hardware itself
* in attempt to correct unreliable link operation.
*/
pcie_capability_write_word(pdev, PCI_EXP_LNKSTA, PCI_EXP_LNKSTA_LBMS);
pcie_reset_lbms_count(pdev);
return rc;
}
@@ -6188,39 +6188,65 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
}
EXPORT_SYMBOL(pcie_bandwidth_available);
/**
 * pcie_get_supported_speeds - query Supported Link Speed Vector
 * @dev: PCI device to query
 *
 * Report which link speeds @dev supports as a bit vector that uses the
 * PCI_EXP_LNKCAP2_SLS_* bit positions.
 *
 * The Supported Link Speeds Vector of the Link Capabilities 2 Register is
 * used when it is non-zero (Implementation Note in PCIe r6.0 sec 7.5.3.18;
 * Link Capabilities 2 was added in PCIe r3.0, sec 7.8.18).
 *
 * Otherwise the vector is synthesized from the Max Link Speed field of the
 * Link Capabilities Register, where only 2.5 GT/s and 5.0 GT/s are
 * recognized.
 *
 * Return: Supported Link Speeds Vector (+ reserved 0 at LSB), or 0 when
 * neither register yields a known speed.
 */
u8 pcie_get_supported_speeds(struct pci_dev *dev)
{
	u32 cap2, cap;
	u8 vector;

	/*
	 * Bit 0 (reserved, always 0) is deliberately kept so that the
	 * PCI_EXP_LNKCAP2_SLS_* defines can be applied to the result as is.
	 */
	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &cap2);
	vector = cap2 & PCI_EXP_LNKCAP2_SLS;

	/* PCIe r3.0-compliant */
	if (vector)
		return vector;

	/* Synthesize from the Max Link Speed field */
	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &cap);

	switch (cap & PCI_EXP_LNKCAP_SLS) {
	case PCI_EXP_LNKCAP_SLS_5_0GB:
		return PCI_EXP_LNKCAP2_SLS_5_0GB | PCI_EXP_LNKCAP2_SLS_2_5GB;
	case PCI_EXP_LNKCAP_SLS_2_5GB:
		return PCI_EXP_LNKCAP2_SLS_2_5GB;
	default:
		return 0;
	}
}
/**
* pcie_get_speed_cap - query for the PCI device's link speed capability
* @dev: PCI device to query
*
* Query the PCI device speed capability. Return the maximum link speed
* supported by the device.
* Query the PCI device speed capability.
*
* Return: the maximum link speed supported by the device.
*/
enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
{
u32 lnkcap2, lnkcap;
/*
* Link Capabilities 2 was added in PCIe r3.0, sec 7.8.18. The
* implementation note there recommends using the Supported Link
* Speeds Vector in Link Capabilities 2 when supported.
*
* Without Link Capabilities 2, i.e., prior to PCIe r3.0, software
* should use the Supported Link Speeds field in Link Capabilities,
* where only 2.5 GT/s and 5.0 GT/s speeds were defined.
*/
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2);
/* PCIe r3.0-compliant */
if (lnkcap2)
return PCIE_LNKCAP2_SLS2SPEED(lnkcap2);
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_5_0GB)
return PCIE_SPEED_5_0GT;
else if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_2_5GB)
return PCIE_SPEED_2_5GT;
return PCI_SPEED_UNKNOWN;
return PCIE_LNKCAP2_SLS2SPEED(dev->supported_speeds);
}
EXPORT_SYMBOL(pcie_get_speed_cap);

View File

@@ -331,6 +331,17 @@ void pci_disable_bridge_window(struct pci_dev *dev);
struct pci_bus *pci_bus_get(struct pci_bus *bus);
void pci_bus_put(struct pci_bus *bus);
/*
 * Convert the Max Link Speed field of a Link Capabilities value into
 * enum pci_bus_speed.
 *
 * @lnkcap may be the whole register: the field is extracted here, so the
 * other capability bits do not have to be masked off by the caller.
 * Evaluates to PCI_SPEED_UNKNOWN for an unrecognized field encoding.
 */
#define PCIE_LNKCAP_SLS2SPEED(lnkcap)					\
({									\
	u32 lnkcap_sls = (lnkcap) & PCI_EXP_LNKCAP_SLS;			\
									\
	(lnkcap_sls == PCI_EXP_LNKCAP_SLS_64_0GB ? PCIE_SPEED_64_0GT :	\
	 lnkcap_sls == PCI_EXP_LNKCAP_SLS_32_0GB ? PCIE_SPEED_32_0GT :	\
	 lnkcap_sls == PCI_EXP_LNKCAP_SLS_16_0GB ? PCIE_SPEED_16_0GT :	\
	 lnkcap_sls == PCI_EXP_LNKCAP_SLS_8_0GB ? PCIE_SPEED_8_0GT :	\
	 lnkcap_sls == PCI_EXP_LNKCAP_SLS_5_0GB ? PCIE_SPEED_5_0GT :	\
	 lnkcap_sls == PCI_EXP_LNKCAP_SLS_2_5GB ? PCIE_SPEED_2_5GT :	\
	 PCI_SPEED_UNKNOWN);						\
})
/* PCIe link information from Link Capabilities 2 */
#define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \
((lnkcap2) & PCI_EXP_LNKCAP2_SLS_64_0GB ? PCIE_SPEED_64_0GT : \
@@ -341,6 +352,15 @@ void pci_bus_put(struct pci_bus *bus);
(lnkcap2) & PCI_EXP_LNKCAP2_SLS_2_5GB ? PCIE_SPEED_2_5GT : \
PCI_SPEED_UNKNOWN)
/*
 * Convert the Target Link Speed field of a Link Control 2 value into
 * enum pci_bus_speed.
 *
 * @lnkctl2 may be the whole register: the field is extracted here, so the
 * remaining control bits do not have to be masked off by the caller.
 * Evaluates to PCI_SPEED_UNKNOWN for an unrecognized field encoding.
 */
#define PCIE_LNKCTL2_TLS2SPEED(lnkctl2)					\
({									\
	u16 lnkctl2_tls = (lnkctl2) & PCI_EXP_LNKCTL2_TLS;		\
									\
	(lnkctl2_tls == PCI_EXP_LNKCTL2_TLS_64_0GT ? PCIE_SPEED_64_0GT : \
	 lnkctl2_tls == PCI_EXP_LNKCTL2_TLS_32_0GT ? PCIE_SPEED_32_0GT : \
	 lnkctl2_tls == PCI_EXP_LNKCTL2_TLS_16_0GT ? PCIE_SPEED_16_0GT : \
	 lnkctl2_tls == PCI_EXP_LNKCTL2_TLS_8_0GT ? PCIE_SPEED_8_0GT :	\
	 lnkctl2_tls == PCI_EXP_LNKCTL2_TLS_5_0GT ? PCIE_SPEED_5_0GT :	\
	 lnkctl2_tls == PCI_EXP_LNKCTL2_TLS_2_5GT ? PCIE_SPEED_2_5GT :	\
	 PCI_SPEED_UNKNOWN);						\
})
/* PCIe speed to Mb/s reduced by encoding overhead */
#define PCIE_SPEED2MBS_ENC(speed) \
((speed) == PCIE_SPEED_64_0GT ? 64000*1/1 : \
@@ -373,12 +393,16 @@ static inline int pcie_dev_speed_mbps(enum pci_bus_speed speed)
return -EINVAL;
}
u8 pcie_get_supported_speeds(struct pci_dev *dev);
const char *pci_speed_string(enum pci_bus_speed speed);
enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev);
enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
void __pcie_print_link_status(struct pci_dev *dev, bool verbose);
void pcie_report_downtraining(struct pci_dev *dev);
void pcie_update_link_speed(struct pci_bus *bus, u16 link_status);
/*
 * Update the cached current speed of @bus from a Link Status value the
 * caller has already read: the Current Link Speed field of @linksta is used
 * as the index into pcie_link_speed[].
 */
static inline void __pcie_update_link_speed(struct pci_bus *bus, u16 linksta)
{
bus->cur_bus_speed = pcie_link_speed[linksta & PCI_EXP_LNKSTA_CLS];
}
void pcie_update_link_speed(struct pci_bus *bus);
/* Single Root I/O Virtualization */
struct pci_sriov {
@@ -692,6 +716,17 @@ static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { }
static inline void pcie_ecrc_get_policy(char *str) { }
#endif
/*
 * LBMS (Link Bandwidth Management Status) count accessors.
 *
 * Without CONFIG_PCIEPORTBUS, resetting the count is a no-op and reading
 * it fails with -EOPNOTSUPP, leaving *val untouched.
 */
#ifdef CONFIG_PCIEPORTBUS
void pcie_reset_lbms_count(struct pci_dev *port);
int pcie_lbms_count(struct pci_dev *port, unsigned long *val);
#else
static inline void pcie_reset_lbms_count(struct pci_dev *port) {}
static inline int pcie_lbms_count(struct pci_dev *port, unsigned long *val)
{
return -EOPNOTSUPP;
}
#endif
struct pci_dev_reset_methods {
u16 vendor;
u16 device;

View File

@@ -4,7 +4,7 @@
pcieportdrv-y := portdrv.o rcec.o
obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o
obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o bwctrl.o
obj-y += aspm.o
obj-$(CONFIG_PCIEAER) += aer.o err.o

366
drivers/pci/pcie/bwctrl.c Normal file
View File

@@ -0,0 +1,366 @@
// SPDX-License-Identifier: GPL-2.0+
/*
* PCIe bandwidth controller
*
* Author: Alexandru Gagniuc <mr.nuke.me@gmail.com>
*
* Copyright (C) 2019 Dell Inc
* Copyright (C) 2023-2024 Intel Corporation
*
* The PCIe bandwidth controller provides a way to alter PCIe Link Speeds
* and notify the operating system when the Link Width or Speed changes. The
* notification capability is required for all Root Ports and Downstream
* Ports supporting Link Width wider than x1 and/or multiple Link Speeds.
*
* This service port driver hooks into the Bandwidth Notification interrupt
* watching for changes or links becoming degraded in operation. It updates
* the cached Current Link Speed that is exposed to user space through sysfs.
*/
#define dev_fmt(fmt) "bwctrl: " fmt
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/bits.h>
#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/pci-bwctrl.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/types.h>
#include "../pci.h"
#include "portdrv.h"
/**
 * struct pcie_bwctrl_data - PCIe bandwidth controller
 * @set_speed_mutex:	Serializes link speed changes; held around
 *			pcie_bwctrl_change_speed() in pcie_set_target_speed()
 * @lbms_count:		Count for LBMS (since last reset); incremented when
 *			LBMS is observed, zeroed by pcie_reset_lbms_count()
 * @cdev:		Thermal cooling device associated with the port, or
 *			NULL when registering it failed
 *
 * Allocated by pcie_bwnotif_probe() and reachable through the port's
 * ->link_bwctrl pointer.
 */
struct pcie_bwctrl_data {
struct mutex set_speed_mutex;
atomic_t lbms_count;
struct thermal_cooling_device *cdev;
};
/*
 * Prevent port removal during LBMS count accessors and Link Speed changes.
 *
 * Readers: pcie_reset_lbms_count() and pcie_lbms_count() take
 * pcie_bwctrl_lbms_rwsem; pcie_set_target_speed() takes
 * pcie_bwctrl_setspeed_rwsem. Probe and remove take both for writing while
 * they change port->link_bwctrl.
 *
 * These have to be differentiated because pcie_bwctrl_change_speed() calls
 * pcie_retrain_link(), which uses the LBMS count reset accessor on success.
 * With just one rwsem that nesting triggers a "possible recursive locking
 * detected" warning.
 */
static DECLARE_RWSEM(pcie_bwctrl_lbms_rwsem);
static DECLARE_RWSEM(pcie_bwctrl_setspeed_rwsem);
static bool pcie_valid_speed(enum pci_bus_speed speed)
{
return (speed >= PCIE_SPEED_2_5GT) && (speed <= PCIE_SPEED_64_0GT);
}
/*
 * Translate a PCIe bus speed into the matching Link Control 2 Target Link
 * Speed encoding.
 *
 * Warns once and returns 0 if @speed is not a valid PCIe Link Speed.
 */
static u16 pci_bus_speed2lnkctl2(enum pci_bus_speed speed)
{
	if (WARN_ON_ONCE(!pcie_valid_speed(speed)))
		return 0;

	switch (speed) {
	case PCIE_SPEED_2_5GT:
		return PCI_EXP_LNKCTL2_TLS_2_5GT;
	case PCIE_SPEED_5_0GT:
		return PCI_EXP_LNKCTL2_TLS_5_0GT;
	case PCIE_SPEED_8_0GT:
		return PCI_EXP_LNKCTL2_TLS_8_0GT;
	case PCIE_SPEED_16_0GT:
		return PCI_EXP_LNKCTL2_TLS_16_0GT;
	case PCIE_SPEED_32_0GT:
		return PCI_EXP_LNKCTL2_TLS_32_0GT;
	case PCIE_SPEED_64_0GT:
		return PCI_EXP_LNKCTL2_TLS_64_0GT;
	default:
		return 0;
	}
}
/*
 * Pick the fastest speed out of a Supported Link Speeds vector: the index
 * of the most significant set bit is returned as the Target Link Speed.
 */
static inline u16 pcie_supported_speeds2target_speed(u8 supported_speeds)
{
	u16 top_bit = __fls(supported_speeds);

	return top_bit;
}
/**
 * pcie_bwctrl_select_speed - Select Target Link Speed
 * @port:	PCIe Port
 * @speed_req:	Requested PCIe Link Speed
 *
 * Select the Target Link Speed by taking into account the Supported Link
 * Speeds of both @port and the first device on its subordinate bus (when
 * there is one). The fastest speed that is supported by both ends and does
 * not exceed @speed_req is chosen.
 *
 * If no such speed exists (e.g., a device that reports no supported speeds
 * at all), 2.5 GT/s is selected.
 *
 * Return: Target Link Speed (1=2.5GT/s, 2=5GT/s, 3=8GT/s, etc.)
 */
static u16 pcie_bwctrl_select_speed(struct pci_dev *port, enum pci_bus_speed speed_req)
{
	struct pci_bus *bus = port->subordinate;
	u8 desired_speeds, supported_speeds;
	struct pci_dev *dev;

	/* Every speed from 2.5 GT/s up to and including the requested one */
	desired_speeds = GENMASK(pci_bus_speed2lnkctl2(speed_req),
				 __fls(PCI_EXP_LNKCAP2_SLS_2_5GB));

	supported_speeds = port->supported_speeds;
	if (bus) {
		down_read(&pci_bus_sem);
		dev = list_first_entry_or_null(&bus->devices, struct pci_dev, bus_list);
		if (dev)
			supported_speeds &= dev->supported_speeds;
		up_read(&pci_bus_sem);
	}

	/*
	 * Fall back to the 2.5 GT/s *bit* rather than returning it directly:
	 * the return value is a Target Link Speed index, so the bit has to go
	 * through __fls() like any other. Applying the fallback after masking
	 * also guarantees __fls() is never handed 0.
	 */
	supported_speeds &= desired_speeds;
	if (!supported_speeds)
		supported_speeds = PCI_EXP_LNKCAP2_SLS_2_5GB;

	return pcie_supported_speeds2target_speed(supported_speeds);
}
/*
 * Program @target_speed into the Target Link Speed field of Link Control 2,
 * retrain the Link and refresh the cached speed of the subordinate bus.
 *
 * Returns 0 on success, or a negative errno from the config write or from
 * pcie_retrain_link().
 */
static int pcie_bwctrl_change_speed(struct pci_dev *port, u16 target_speed, bool use_lt)
{
	struct pci_bus *child;
	int rc;

	rc = pcie_capability_clear_and_set_word(port, PCI_EXP_LNKCTL2,
						PCI_EXP_LNKCTL2_TLS,
						target_speed);
	if (rc != PCIBIOS_SUCCESSFUL)
		return pcibios_err_to_errno(rc);

	rc = pcie_retrain_link(port, use_lt);
	if (rc < 0)
		return rc;

	/*
	 * Ensure link speed updates also with platforms that have problems
	 * with notifications.
	 */
	child = port->subordinate;
	if (child)
		pcie_update_link_speed(child);

	return 0;
}
/**
 * pcie_set_target_speed - Set downstream Link Speed for PCIe Port
 * @port:	PCIe Port
 * @speed_req:	Requested PCIe Link Speed
 * @use_lt:	Wait for the LT or DLLLA bit to detect the end of link training
 *
 * Attempt to set PCIe Port Link Speed to @speed_req. @speed_req may be
 * adjusted downwards to the best speed supported by both the Port and PCIe
 * Device underneath it.
 *
 * An invalid @speed_req additionally triggers a one-time warning.
 *
 * Return:
 * * 0		- on success
 * * -EINVAL	- @speed_req is not a PCIe Link Speed
 * * -ENODEV	- @port is not controllable
 * * -ETIMEDOUT	- changing Link Speed took too long
 * * -EAGAIN	- Link Speed was changed but @speed_req was not achieved
 */
int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
bool use_lt)
{
struct pci_bus *bus = port->subordinate;
u16 target_speed;
int ret;
if (WARN_ON_ONCE(!pcie_valid_speed(speed_req)))
return -EINVAL;
/* Nothing to do when the cached current speed already matches */
if (bus && bus->cur_bus_speed == speed_req)
return 0;
target_speed = pcie_bwctrl_select_speed(port, speed_req);
/*
 * Probe and remove take this rwsem for writing while they change
 * port->link_bwctrl, so holding it for reading keeps @data stable for
 * the duration of the speed change.
 */
scoped_guard(rwsem_read, &pcie_bwctrl_setspeed_rwsem) {
struct pcie_bwctrl_data *data = port->link_bwctrl;
/*
* port->link_bwctrl is NULL during initial scan when called
* e.g. from the Target Speed quirk.
*/
if (data)
mutex_lock(&data->set_speed_mutex);
ret = pcie_bwctrl_change_speed(port, target_speed, use_lt);
if (data)
mutex_unlock(&data->set_speed_mutex);
}
/*
* Despite setting higher speed into the Target Link Speed, empty
* bus won't train to 5GT+ speeds.
*/
if (!ret && bus && bus->cur_bus_speed != speed_req &&
!list_empty(&bus->devices))
ret = -EAGAIN;
return ret;
}
/*
 * Enable Link Bandwidth notifications for the port behind @srv.
 *
 * The sequence is: account for an LBMS indication that is already latched,
 * set both interrupt enable bits in Link Control, write the LBMS and LABS
 * bits back to Link Status, and finally refresh the cached link speed.
 */
static void pcie_bwnotif_enable(struct pcie_device *srv)
{
struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
struct pci_dev *port = srv->port;
u16 link_status;
int ret;
/* Count LBMS seen so far as one */
ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
if (ret == PCIBIOS_SUCCESSFUL && link_status & PCI_EXP_LNKSTA_LBMS)
atomic_inc(&data->lbms_count);
pcie_capability_set_word(port, PCI_EXP_LNKCTL,
PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
pcie_capability_write_word(port, PCI_EXP_LNKSTA,
PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS);
/*
* Update after enabling notifications & clearing status bits ensures
* link speed is up to date.
*/
pcie_update_link_speed(port->subordinate);
}
static void pcie_bwnotif_disable(struct pci_dev *port)
{
pcie_capability_clear_word(port, PCI_EXP_LNKCTL,
PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
}
/*
 * Interrupt handler for Link Bandwidth notifications.
 *
 * Returns IRQ_NONE when Link Status cannot be read or neither LBMS nor LABS
 * is set; otherwise counts LBMS, acknowledges the pending bits and
 * refreshes the cached link speed before returning IRQ_HANDLED.
 */
static irqreturn_t pcie_bwnotif_irq(int irq, void *context)
{
	struct pcie_device *srv = context;
	struct pci_dev *port = srv->port;
	struct pcie_bwctrl_data *data = port->link_bwctrl;
	u16 lnksta, pending;

	if (pcie_capability_read_word(port, PCI_EXP_LNKSTA, &lnksta) !=
	    PCIBIOS_SUCCESSFUL)
		return IRQ_NONE;

	pending = lnksta & (PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS);
	if (!pending)
		return IRQ_NONE;

	if (pending & PCI_EXP_LNKSTA_LBMS)
		atomic_inc(&data->lbms_count);

	/* Acknowledge exactly the bits that were seen */
	pcie_capability_write_word(port, PCI_EXP_LNKSTA, pending);

	/*
	 * Interrupts will not be triggered from any further Link Speed
	 * change until LBMS is cleared by the write. Therefore, re-read the
	 * speed (inside pcie_update_link_speed()) after LBMS has been
	 * cleared to avoid missing link speed changes.
	 */
	pcie_update_link_speed(port->subordinate);

	return IRQ_HANDLED;
}
/*
 * Forget LBMS indications seen so far on @port.
 *
 * With the bandwidth controller bound, its software counter is zeroed;
 * without it, the LBMS bit is written to Link Status instead.
 */
void pcie_reset_lbms_count(struct pci_dev *port)
{
	struct pcie_bwctrl_data *bwctrl;

	guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);

	bwctrl = port->link_bwctrl;
	if (!bwctrl) {
		pcie_capability_write_word(port, PCI_EXP_LNKSTA,
					   PCI_EXP_LNKSTA_LBMS);
		return;
	}

	atomic_set(&bwctrl->lbms_count, 0);
}
/*
 * Read the number of LBMS indications counted for @port into *@val.
 *
 * Returns 0 on success, or -ENOTTY (leaving *@val untouched) when the
 * bandwidth controller is not bound to @port.
 */
int pcie_lbms_count(struct pci_dev *port, unsigned long *val)
{
	struct pcie_bwctrl_data *bwctrl;

	guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);

	bwctrl = port->link_bwctrl;
	if (bwctrl) {
		*val = atomic_read(&bwctrl->lbms_count);
		return 0;
	}

	return -ENOTTY;
}
static int pcie_bwnotif_probe(struct pcie_device *srv)
{
struct pci_dev *port = srv->port;
int ret;
struct pcie_bwctrl_data *data = devm_kzalloc(&srv->device,
sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
ret = devm_mutex_init(&srv->device, &data->set_speed_mutex);
if (ret)
return ret;
ret = devm_request_irq(&srv->device, srv->irq, pcie_bwnotif_irq,
IRQF_SHARED, "PCIe bwctrl", srv);
if (ret)
return ret;
scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem) {
scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
port->link_bwctrl = no_free_ptr(data);
pcie_bwnotif_enable(srv);
}
}
pci_dbg(port, "enabled with IRQ %d\n", srv->irq);
/* Don't fail on errors. Don't leave IS_ERR() "pointer" into ->cdev */
port->link_bwctrl->cdev = pcie_cooling_device_register(port);
if (IS_ERR(port->link_bwctrl->cdev))
port->link_bwctrl->cdev = NULL;
return 0;
}
static void pcie_bwnotif_remove(struct pcie_device *srv)
{
struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
pcie_cooling_device_unregister(data->cdev);
pcie_bwnotif_disable(srv->port);
scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem)
scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem)
srv->port->link_bwctrl = NULL;
}
/* Suspend callback: turn Link Bandwidth notifications off. Always returns 0. */
static int pcie_bwnotif_suspend(struct pcie_device *srv)
{
pcie_bwnotif_disable(srv->port);
return 0;
}
/*
 * Resume callback: re-run the full enable sequence, which also refreshes
 * the cached link speed. Always returns 0.
 */
static int pcie_bwnotif_resume(struct pcie_device *srv)
{
pcie_bwnotif_enable(srv);
return 0;
}
/*
 * Port service driver for the bandwidth controller. It binds to the
 * PCIE_PORT_SERVICE_BWCTRL service on any port type.
 */
static struct pcie_port_service_driver pcie_bwctrl_driver = {
.name = "pcie_bwctrl",
.port_type = PCIE_ANY_PORT,
.service = PCIE_PORT_SERVICE_BWCTRL,
.probe = pcie_bwnotif_probe,
.suspend = pcie_bwnotif_suspend,
.resume = pcie_bwnotif_resume,
.remove = pcie_bwnotif_remove,
};
/*
 * Register the bandwidth controller port service driver.
 * Returns the result of pcie_port_service_register().
 */
int __init pcie_bwctrl_init(void)
{
return pcie_port_service_register(&pcie_bwctrl_driver);
}

View File

@@ -68,7 +68,7 @@ static int pcie_message_numbers(struct pci_dev *dev, int mask,
*/
if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
PCIE_PORT_SERVICE_BWNOTIF)) {
PCIE_PORT_SERVICE_BWCTRL)) {
pcie_capability_read_word(dev, PCI_EXP_FLAGS, &reg16);
*pme = FIELD_GET(PCI_EXP_FLAGS_IRQ, reg16);
nvec = *pme + 1;
@@ -150,11 +150,11 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
/* PME, hotplug and bandwidth notification share an MSI/MSI-X vector */
if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
PCIE_PORT_SERVICE_BWNOTIF)) {
PCIE_PORT_SERVICE_BWCTRL)) {
pcie_irq = pci_irq_vector(dev, pme);
irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
irqs[PCIE_PORT_SERVICE_BWCTRL_SHIFT] = pcie_irq;
}
if (mask & PCIE_PORT_SERVICE_AER)
@@ -271,7 +271,7 @@ static int get_port_device_capability(struct pci_dev *dev)
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &linkcap);
if (linkcap & PCI_EXP_LNKCAP_LBNC)
services |= PCIE_PORT_SERVICE_BWNOTIF;
services |= PCIE_PORT_SERVICE_BWCTRL;
}
return services;
@@ -828,6 +828,7 @@ static void __init pcie_init_services(void)
pcie_aer_init();
pcie_pme_init();
pcie_dpc_init();
pcie_bwctrl_init();
pcie_hp_init();
}

View File

@@ -20,8 +20,8 @@
#define PCIE_PORT_SERVICE_HP (1 << PCIE_PORT_SERVICE_HP_SHIFT)
#define PCIE_PORT_SERVICE_DPC_SHIFT 3 /* Downstream Port Containment */
#define PCIE_PORT_SERVICE_DPC (1 << PCIE_PORT_SERVICE_DPC_SHIFT)
#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT 4 /* Bandwidth notification */
#define PCIE_PORT_SERVICE_BWNOTIF (1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
#define PCIE_PORT_SERVICE_BWCTRL_SHIFT 4 /* Bandwidth Controller (notifications) */
#define PCIE_PORT_SERVICE_BWCTRL (1 << PCIE_PORT_SERVICE_BWCTRL_SHIFT)
#define PCIE_PORT_DEVICE_MAXSERVICES 5
@@ -51,6 +51,8 @@ int pcie_dpc_init(void);
static inline int pcie_dpc_init(void) { return 0; }
#endif
int pcie_bwctrl_init(void);
/* Port Type */
#define PCIE_ANY_PORT (~0)

View File

@@ -742,9 +742,13 @@ const char *pci_speed_string(enum pci_bus_speed speed)
}
EXPORT_SYMBOL_GPL(pci_speed_string);
void pcie_update_link_speed(struct pci_bus *bus, u16 linksta)
void pcie_update_link_speed(struct pci_bus *bus)
{
bus->cur_bus_speed = pcie_link_speed[linksta & PCI_EXP_LNKSTA_CLS];
struct pci_dev *bridge = bus->self;
u16 linksta;
pcie_capability_read_word(bridge, PCI_EXP_LNKSTA, &linksta);
__pcie_update_link_speed(bus, linksta);
}
EXPORT_SYMBOL_GPL(pcie_update_link_speed);
@@ -827,13 +831,11 @@ static void pci_set_bus_speed(struct pci_bus *bus)
if (pci_is_pcie(bridge)) {
u32 linkcap;
u16 linksta;
pcie_capability_read_dword(bridge, PCI_EXP_LNKCAP, &linkcap);
bus->max_bus_speed = pcie_link_speed[linkcap & PCI_EXP_LNKCAP_SLS];
pcie_capability_read_word(bridge, PCI_EXP_LNKSTA, &linksta);
pcie_update_link_speed(bus, linksta);
pcie_update_link_speed(bus);
}
}
@@ -1947,6 +1949,9 @@ int pci_setup_device(struct pci_dev *dev)
set_pcie_untrusted(dev);
if (pci_is_pcie(dev))
dev->supported_speeds = pcie_get_supported_speeds(dev);
/* "Unknown power state" */
dev->current_state = PCI_UNKNOWN;

View File

@@ -33,6 +33,18 @@
#include <linux/switchtec.h>
#include "pci.h"
/*
 * Tell whether LBMS has been seen on @dev.
 *
 * The LBMS count is preferred; when it cannot be obtained, the LBMS bit of
 * the Link Status value @lnksta supplied by the caller is used instead.
 */
static bool pcie_lbms_seen(struct pci_dev *dev, u16 lnksta)
{
	unsigned long seen;

	if (pcie_lbms_count(dev, &seen) < 0)
		return lnksta & PCI_EXP_LNKSTA_LBMS;

	return seen != 0;
}
/*
* Retrain the link of a downstream PCIe port by hand if necessary.
*
@@ -96,22 +108,16 @@ int pcie_failed_link_retrain(struct pci_dev *dev)
pcie_capability_read_word(dev, PCI_EXP_LNKCTL2, &lnkctl2);
pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
if ((lnksta & (PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_DLLLA)) ==
PCI_EXP_LNKSTA_LBMS) {
if (!(lnksta & PCI_EXP_LNKSTA_DLLLA) && pcie_lbms_seen(dev, lnksta)) {
u16 oldlnkctl2 = lnkctl2;
pci_info(dev, "broken device, retraining non-functional downstream link at 2.5GT/s\n");
lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS;
lnkctl2 |= PCI_EXP_LNKCTL2_TLS_2_5GT;
pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
ret = pcie_retrain_link(dev, false);
ret = pcie_set_target_speed(dev, PCIE_SPEED_2_5GT, false);
if (ret) {
pci_info(dev, "retraining failed\n");
pcie_capability_write_word(dev, PCI_EXP_LNKCTL2,
oldlnkctl2);
pcie_retrain_link(dev, true);
pcie_set_target_speed(dev, PCIE_LNKCTL2_TLS2SPEED(oldlnkctl2),
true);
return ret;
}
@@ -125,11 +131,7 @@ int pcie_failed_link_retrain(struct pci_dev *dev)
pci_info(dev, "removing 2.5GT/s downstream link speed restriction\n");
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS;
lnkctl2 |= lnkcap & PCI_EXP_LNKCAP_SLS;
pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
ret = pcie_retrain_link(dev, false);
ret = pcie_set_target_speed(dev, PCIE_LNKCAP_SLS2SPEED(lnkcap), false);
if (ret) {
pci_info(dev, "retraining failed\n");
return ret;

View File

@@ -220,6 +220,15 @@ config DEVFREQ_THERMAL
If you want this support, you should say Y here.
config PCIE_THERMAL
bool "PCIe cooling support"
depends on PCIEPORTBUS
help
This implements a PCIe cooling mechanism through bandwidth reduction
for PCIe devices.
If you want this support, you should say Y here.
config THERMAL_EMULATION
bool "Thermal emulation mode support"
help

View File

@@ -31,6 +31,8 @@ thermal_sys-$(CONFIG_CPU_IDLE_THERMAL) += cpuidle_cooling.o
# devfreq cooling
thermal_sys-$(CONFIG_DEVFREQ_THERMAL) += devfreq_cooling.o
thermal_sys-$(CONFIG_PCIE_THERMAL) += pcie_cooling.o
obj-$(CONFIG_K3_THERMAL) += k3_bandgap.o k3_j72xx_bandgap.o
# platform thermal drivers
obj-y += broadcom/

View File

@@ -0,0 +1,80 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* PCIe cooling device
*
* Copyright (C) 2023-2024 Intel Corporation
*/
#include <linux/build_bug.h>
#include <linux/cleanup.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/pci-bwctrl.h>
#include <linux/slab.h>
#include <linux/sprintf.h>
#include <linux/thermal.h>
#define COOLING_DEV_TYPE_PREFIX "PCIe_Port_Link_Speed_"
static int pcie_cooling_get_max_level(struct thermal_cooling_device *cdev, unsigned long *state)
{
struct pci_dev *port = cdev->devdata;
/* cooling state 0 is same as the maximum PCIe speed */
*state = port->subordinate->max_bus_speed - PCIE_SPEED_2_5GT;
return 0;
}
static int pcie_cooling_get_cur_level(struct thermal_cooling_device *cdev, unsigned long *state)
{
struct pci_dev *port = cdev->devdata;
/* cooling state 0 is same as the maximum PCIe speed */
*state = cdev->max_state - (port->subordinate->cur_bus_speed - PCIE_SPEED_2_5GT);
return 0;
}
/*
 * Apply cooling state @state by requesting the corresponding Link Speed.
 * Returns whatever pcie_set_target_speed() returns.
 */
static int pcie_cooling_set_cur_level(struct thermal_cooling_device *cdev, unsigned long state)
{
	struct pci_dev *port = cdev->devdata;

	/* cooling state 0 is same as the maximum PCIe speed */
	return pcie_set_target_speed(port,
				     (cdev->max_state - state) + PCIE_SPEED_2_5GT,
				     true);
}
/* Cooling device callbacks; a cooling state maps to a PCIe Link Speed step. */
static struct thermal_cooling_device_ops pcie_cooling_ops = {
.get_max_state = pcie_cooling_get_max_level,
.get_cur_state = pcie_cooling_get_cur_level,
.set_cur_state = pcie_cooling_set_cur_level,
};
struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port)
{
char *name __free(kfree) =
kasprintf(GFP_KERNEL, COOLING_DEV_TYPE_PREFIX "%s", pci_name(port));
if (!name)
return ERR_PTR(-ENOMEM);
return thermal_cooling_device_register(name, port, &pcie_cooling_ops);
}
/* Unregister a cooling device obtained from pcie_cooling_device_register(). */
void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev)
{
thermal_cooling_device_unregister(cdev);
}
/* For bus_speed <-> state arithmetic */
static_assert(PCIE_SPEED_2_5GT + 1 == PCIE_SPEED_5_0GT);
static_assert(PCIE_SPEED_5_0GT + 1 == PCIE_SPEED_8_0GT);
static_assert(PCIE_SPEED_8_0GT + 1 == PCIE_SPEED_16_0GT);
static_assert(PCIE_SPEED_16_0GT + 1 == PCIE_SPEED_32_0GT);
static_assert(PCIE_SPEED_32_0GT + 1 == PCIE_SPEED_64_0GT);
MODULE_AUTHOR("Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>");
MODULE_DESCRIPTION("PCIe cooling driver");

View File

@@ -0,0 +1,28 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* PCIe bandwidth controller
*
* Copyright (C) 2023-2024 Intel Corporation
*/
#ifndef LINUX_PCI_BWCTRL_H
#define LINUX_PCI_BWCTRL_H
#include <linux/pci.h>
struct thermal_cooling_device;
/*
 * PCIe cooling device registration. Without CONFIG_PCIE_THERMAL, the
 * register stub returns NULL and the unregister stub does nothing.
 */
#ifdef CONFIG_PCIE_THERMAL
struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port);
void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev);
#else
static inline struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port)
{
return NULL;
}
static inline void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev)
{
}
#endif
#endif

View File

@@ -313,12 +313,20 @@ struct pci_vpd {
};
struct irq_affinity;
struct pcie_bwctrl_data;
struct pcie_link_state;
struct pci_sriov;
struct pci_p2pdma;
struct rcec_ea;
/* The pci_dev structure describes PCI devices */
/* struct pci_dev - describes a PCI device
*
* @supported_speeds: PCIe Supported Link Speeds Vector (+ reserved 0 at
* LSB). 0 when the supported speeds cannot be
* determined (e.g., for Root Complex Integrated
* Endpoints without the relevant Capability
* Registers).
*/
struct pci_dev {
struct list_head bus_list; /* Node in per-bus list */
struct pci_bus *bus; /* Bus this device is on */
@@ -495,6 +503,7 @@ struct pci_dev {
unsigned int dpc_rp_extensions:1;
u8 dpc_rp_log_size;
#endif
struct pcie_bwctrl_data *link_bwctrl;
#ifdef CONFIG_PCI_ATS
union {
struct pci_sriov *sriov; /* PF: SR-IOV info */
@@ -522,6 +531,7 @@ struct pci_dev {
struct npem *npem; /* Native PCIe Enclosure Management */
#endif
u16 acs_cap; /* ACS Capability offset */
u8 supported_speeds; /* Supported Link Speeds Vector */
phys_addr_t rom; /* Physical address if not from BAR */
size_t romlen; /* Length if not from BAR */
/*
@@ -1274,6 +1284,7 @@ static inline int pcie_capability_clear_and_set_word(struct pci_dev *dev,
{
switch (pos) {
case PCI_EXP_LNKCTL:
case PCI_EXP_LNKCTL2:
case PCI_EXP_RTCTL:
return pcie_capability_clear_and_set_word_locked(dev, pos,
clear, set);
@@ -1786,9 +1797,19 @@ static inline int pci_irqd_intx_xlate(struct irq_domain *d,
#ifdef CONFIG_PCIEPORTBUS
extern bool pcie_ports_disabled;
extern bool pcie_ports_native;
int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
bool use_lt);
#else
#define pcie_ports_disabled true
#define pcie_ports_native false
static inline int pcie_set_target_speed(struct pci_dev *port,
enum pci_bus_speed speed_req,
bool use_lt)
{
return -EOPNOTSUPP;
}
#endif
#define PCIE_LINK_STATE_L0S (BIT(0) | BIT(1)) /* Upstr/dwnstr L0s */

View File

@@ -678,6 +678,7 @@
#define PCI_EXP_DEVSTA2 0x2a /* Device Status 2 */
#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2 0x2c /* end of v2 EPs w/o link */
#define PCI_EXP_LNKCAP2 0x2c /* Link Capabilities 2 */
#define PCI_EXP_LNKCAP2_SLS 0x000000fe /* Supported Link Speeds Vector */
#define PCI_EXP_LNKCAP2_SLS_2_5GB 0x00000002 /* Supported Speed 2.5GT/s */
#define PCI_EXP_LNKCAP2_SLS_5_0GB 0x00000004 /* Supported Speed 5GT/s */
#define PCI_EXP_LNKCAP2_SLS_8_0GB 0x00000008 /* Supported Speed 8GT/s */

View File

@@ -72,6 +72,7 @@ TARGETS += net/packetdrill
TARGETS += net/rds
TARGETS += net/tcp_ao
TARGETS += nsfs
TARGETS += pcie_bwctrl
TARGETS += perf_events
TARGETS += pidfd
TARGETS += pid_namespace

View File

@@ -0,0 +1,2 @@
TEST_PROGS = set_pcie_cooling_state.sh
include ../lib.mk

View File

@@ -0,0 +1,122 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0-or-later
SYSFS=
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
retval=0
skipmsg="skip all tests:"
PCIEPORTTYPE="PCIe_Port_Link_Speed"
# Exit with the kselftest SKIP code unless the test can run: it needs root,
# a mounted sysfs (its mount point is stored in SYSFS) and at least one
# thermal cooling device whose type starts with $PCIEPORTTYPE.
prerequisite()
{
	local ports

	if [ $UID != 0 ]; then
		echo $skipmsg must be run as root >&2
		exit $ksft_skip
	fi

	# Third field of the first sysfs mount line is the mount point
	SYSFS=$(mount -t sysfs | head -1 | awk '{ print $3 }')
	if [ ! -d "$SYSFS" ]; then
		echo $skipmsg sysfs is not mounted >&2
		exit $ksft_skip
	fi

	if ! ls $SYSFS/class/thermal/cooling_device* > /dev/null 2>&1; then
		echo $skipmsg thermal cooling devices missing >&2
		exit $ksft_skip
	fi

	ports=$(grep -e "^$PCIEPORTTYPE" $SYSFS/class/thermal/cooling_device*/type | wc -l)
	if [ $ports -eq 0 ]; then
		echo $skipmsg pcie cooling devices missing >&2
		exit $ksft_skip
	fi
}
testport=
# Pick the cooling device to test and store its sysfs directory in $testport.
#
# $1: fixed string that the cooling device "type" file has to contain.
#
# Among the matching devices, the one with the largest max_state - cur_state
# (i.e., the highest current PCIe Link Speed) wins. $testport is left
# untouched when nothing matches.
find_pcie_port()
{
	local patt="$1"
	local sysfs="${SYSFS:-/sys}"
	local pcieports
	local port
	local max
	local cur
	local delta
	local bestdelta=-1

	# Search under the sysfs mount point found by prerequisite(), as the
	# rest of the script does, falling back to /sys when it is not set.
	pcieports=`grep -l -F -e "$patt" $sysfs/class/thermal/cooling_device*/type`
	if [ -z "$pcieports" ]; then
		return
	fi
	# Strip the trailing "/type" to get the cooling device directories
	pcieports=${pcieports//\/type/}

	# Find the port with the highest PCIe Link Speed
	for port in $pcieports; do
		max=`cat $port/max_state`
		cur=`cat $port/cur_state`
		delta=$((max-cur))
		if [ $delta -gt $bestdelta ]; then
			testport="$port"
			bestdelta=$delta
		fi
	done
}
sysfspcidev=
# Locate the current_link_speed file of the PCI device that belongs to a
# cooling device and store its path in $sysfspcidev.
#
# $1: cooling device sysfs directory. The PCI device name is what remains of
#     its "type" after the "${PCIEPORTTYPE}_" prefix is removed.
#
# $sysfspcidev is left untouched when the file is not readable.
find_sysfs_pci_dev()
{
	local typefile="$1/type"
	local bdf
	local pcidir

	bdf=$(sed -e "s|^${PCIEPORTTYPE}_||g" $typefile)
	pcidir="$SYSFS/bus/pci/devices/$bdf"

	if [ -r "$pcidir/current_link_speed" ]; then
		sysfspcidev="$pcidir/current_link_speed"
	fi
}
# Print command line help to stdout.
usage()
{
echo "Usage $0 [ -d dev ]"
echo -e "\t-d: PCIe port BDF string (e.g., 0000:00:04.0)"
}
pattern="$PCIEPORTTYPE"
# Parse the command line.
#
#   -d dev  restrict the test to the cooling device of PCIe port "dev" by
#           setting $pattern to "${PCIEPORTTYPE}_dev"
#   -h      print usage and exit
#
# Any other option prints usage and exits as well.
parse_arguments()
{
	while getopts d:h opt; do
		case $opt in
			h)
				usage "$0"
				exit 0
				;;
			d)
				# Braces are required: "$PCIEPORTTYPE_" would
				# name a different (unset) variable.
				pattern="${PCIEPORTTYPE}_$OPTARG"
				;;
			*)
				usage "$0"
				exit 0
				;;
		esac
	done
}
# Main flow: parse options, check prerequisites, pick the cooling device and
# its PCI device, then hand both to the actual speed test. The test is
# skipped when either of them cannot be found.
parse_arguments "$@"
prerequisite
find_pcie_port "$pattern"
if [ -z "$testport" ]; then
echo $skipmsg "pcie cooling device not found from sysfs" >&2
exit $ksft_skip
fi
find_sysfs_pci_dev "$testport"
if [ -z "$sysfspcidev" ]; then
echo $skipmsg "PCIe port device not found from sysfs" >&2
exit $ksft_skip
fi
# The exit status of the speed test becomes the result of this script
./set_pcie_speed.sh "$testport" "$sysfspcidev"
retval=$?
exit $retval

View File

@@ -0,0 +1,67 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0-or-later
set -e
TESTNAME=set_pcie_speed
declare -a PCIELINKSPEED=(
"2.5 GT/s PCIe"
"5.0 GT/s PCIe"
"8.0 GT/s PCIe"
"16.0 GT/s PCIe"
"32.0 GT/s PCIe"
"64.0 GT/s PCIe"
)
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
retval=0
coolingdev="$1"
statefile="$coolingdev/cur_state"
maxfile="$coolingdev/max_state"
linkspeedfile="$2"
oldstate=`cat $statefile`
maxstate=`cat $maxfile`
# Write cooling state $1 to the cooling device, wait a second, and check
# that the link speed file reports the matching entry of PCIELINKSPEED
# (index maxstate - state). A mismatch is reported and sets retval=1.
set_state()
{
	local state=$1
	local linkspeed
	local expected_linkspeed

	echo $state > $statefile

	sleep 1

	linkspeed="$(cat $linkspeedfile)"
	expected_linkspeed=$((maxstate-state))
	expected_str="${PCIELINKSPEED[$expected_linkspeed]}"
	if [ "${expected_str}" != "${linkspeed}" ]; then
		echo "$TESTNAME failed: expected: ${expected_str}; got ${linkspeed}"
		retval=1
	fi
}
# EXIT trap handler: put the cooling device back into the state it had when
# the test started and exit with the kselftest SKIP code.
cleanup_skip ()
{
set_state $oldstate
exit $ksft_skip
}
# While the states are being walked, any exit (including one caused by
# "set -e") goes through cleanup_skip.
trap cleanup_skip EXIT
echo "$TESTNAME: testing states $maxstate .. $oldstate with $coolingdev"
# Step from the deepest cooling state back to the original one
for i in $(seq $maxstate -1 $oldstate); do
set_state "$i"
done
# All states were visited: drop the trap so the real result is reported
trap EXIT
if [ $retval -eq 0 ]; then
echo "$TESTNAME [PASS]"
else
echo "$TESTNAME [FAIL]"
fi
exit $retval