Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

Pull virtio updates from Michael Tsirkin:

 - in-order support in virtio core

 - multiple address space support in vduse

 - fixes, cleanups all over the place, notably dma alignment fixes for
   non-cache-coherent systems

* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (59 commits)
  vduse: avoid adding implicit padding
  vhost: fix caching attributes of MMIO regions by setting them explicitly
  vdpa/mlx5: update MAC address handling in mlx5_vdpa_set_attr()
  vdpa/mlx5: reuse common function for MAC address updates
  vdpa/mlx5: update mlx_features with driver state check
  crypto: virtio: Replace package id with numa node id
  crypto: virtio: Remove duplicated virtqueue_kick in virtio_crypto_skcipher_crypt_req
  crypto: virtio: Add spinlock protection with virtqueue notification
  Documentation: Add documentation for VDUSE Address Space IDs
  vduse: bump version number
  vduse: add vq group asid support
  vduse: merge tree search logic of IOTLB_GET_FD and IOTLB_GET_INFO ioctls
  vduse: take out allocations from vduse_dev_alloc_coherent
  vduse: remove unused vaddr parameter of vduse_domain_free_coherent
  vduse: refactor vdpa_dev_add for goto err handling
  vhost: forbid change vq groups ASID if DRIVER_OK is set
  vdpa: document set_group_asid thread safety
  vduse: return internal vq group struct as map token
  vduse: add vq group support
  vduse: add v1 API definition
  ...
This commit is contained in:
Linus Torvalds
2026-02-13 12:02:18 -08:00
23 changed files with 1561 additions and 513 deletions

View File

@@ -146,6 +146,58 @@ What about block I/O and networking buffers? The block I/O and
networking subsystems make sure that the buffers they use are valid
for you to DMA from/to.
__dma_from_device_group_begin/end annotations
=============================================
As explained previously, when a structure contains a DMA_FROM_DEVICE /
DMA_BIDIRECTIONAL buffer (device writes to memory) alongside fields that the
CPU writes to, cache line sharing between the DMA buffer and CPU-written fields
can cause data corruption on CPUs with DMA-incoherent caches.
The ``__dma_from_device_group_begin(GROUP)/__dma_from_device_group_end(GROUP)``
macros ensure proper alignment to prevent this::
struct my_device {
spinlock_t lock1;
__dma_from_device_group_begin();
char dma_buffer1[16];
char dma_buffer2[16];
__dma_from_device_group_end();
spinlock_t lock2;
};
To isolate a DMA buffer from adjacent fields, use
``__dma_from_device_group_begin(GROUP)`` before the first DMA buffer
field and ``__dma_from_device_group_end(GROUP)`` after the last DMA
buffer field (with the same GROUP name). This protects both the head
and tail of the buffer from cache line sharing.
The GROUP parameter is an optional identifier that names the DMA buffer group
(in case you have several in the same structure)::
struct my_device {
spinlock_t lock1;
__dma_from_device_group_begin(buffer1);
char dma_buffer1[16];
__dma_from_device_group_end(buffer1);
spinlock_t lock2;
__dma_from_device_group_begin(buffer2);
char dma_buffer2[16];
__dma_from_device_group_end(buffer2);
};
On cache-coherent platforms these macros expand to zero-length array markers.
On non-coherent platforms, they also ensure the minimal DMA alignment, which
can be as large as 128 bytes.
.. note::
It is allowed (though somewhat fragile) to include extra fields, not
intended for DMA from the device, within the group (in order to pack the
structure tightly) - but only as long as the CPU does not write these
fields while any fields in the group are mapped for DMA_FROM_DEVICE or
DMA_BIDIRECTIONAL.
DMA addressing capabilities
===========================

View File

@@ -148,3 +148,12 @@ DMA_ATTR_MMIO is appropriate.
For architectures that require cache flushing for DMA coherence
DMA_ATTR_MMIO will not perform any cache flushing. The address
provided must never be mapped cacheable into the CPU.
DMA_ATTR_CPU_CACHE_CLEAN
------------------------
This attribute indicates the CPU will not dirty any cacheline overlapping this
DMA_FROM_DEVICE/DMA_BIDIRECTIONAL buffer while it is mapped. This allows
multiple small buffers to safely share a cacheline without risk of data
corruption, suppressing DMA debug warnings about overlapping mappings.
All mappings sharing a cacheline should have this attribute.

View File

@@ -230,4 +230,57 @@ able to start the dataplane processing as follows:
5. Inject an interrupt for specific virtqueue with the VDUSE_INJECT_VQ_IRQ ioctl
after the used ring is filled.
Enabling ASID (API version 1)
------------------------------
VDUSE supports per-address-space identifiers (ASIDs) starting with API
version 1. Set it up with ioctl(VDUSE_SET_API_VERSION) on `/dev/vduse/control`
and pass `VDUSE_API_VERSION_1` before creating a new VDUSE instance with
ioctl(VDUSE_CREATE_DEV).
Afterwards, you can use the asid member of the ioctl(VDUSE_VQ_SETUP) argument
to select the address space of the IOTLB you are querying. The driver can
change the address space of any virtqueue group by sending the
VDUSE_SET_VQ_GROUP_ASID VDUSE message, and the VDUSE instance must reply with
VDUSE_REQ_RESULT_OK if the change was possible.
Similarly, you can use ioctl(VDUSE_IOTLB_GET_FD2) to obtain the file descriptor
describing an IOVA region of a specific ASID. Example usage:
.. code-block:: c
static void *iova_to_va(int dev_fd, uint32_t asid, uint64_t iova,
uint64_t *len)
{
int fd;
void *addr;
size_t size;
struct vduse_iotlb_entry_v2 entry = { 0 };
entry.v1.start = iova;
entry.v1.last = iova;
entry.asid = asid;
fd = ioctl(dev_fd, VDUSE_IOTLB_GET_FD2, &entry);
if (fd < 0)
return NULL;
size = entry.v1.last - entry.v1.start + 1;
*len = entry.v1.last - iova + 1;
addr = mmap(0, size, perm_to_prot(entry.v1.perm), MAP_SHARED,
fd, entry.v1.offset);
close(fd);
if (addr == MAP_FAILED)
return NULL;
/*
* Using some data structures such as linked list to store
* the iotlb mapping. The munmap(2) should be called for the
* cached mapping when the corresponding VDUSE_UPDATE_IOTLB
* message is received or the device is reset.
*/
return addr + iova - entry.v1.start;
}
For more details on the uAPI, please see include/uapi/linux/vduse.h.

View File

@@ -11,6 +11,7 @@
#include <linux/spinlock.h>
#include <linux/virtio.h>
#include <linux/virtio_rng.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -28,11 +29,13 @@ struct virtrng_info {
unsigned int data_avail;
unsigned int data_idx;
/* minimal size returned by rng_buffer_size() */
__dma_from_device_group_begin();
#if SMP_CACHE_BYTES < 32
u8 data[32];
#else
u8 data[SMP_CACHE_BYTES];
#endif
__dma_from_device_group_end();
};
static void random_recv_done(struct virtqueue *vq)

View File

@@ -10,6 +10,7 @@
*/
#include <linux/completion.h>
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/gpio/driver.h>
#include <linux/io.h>
@@ -24,9 +25,13 @@
struct virtio_gpio_line {
struct mutex lock; /* Protects line operation */
struct completion completion;
struct virtio_gpio_request req ____cacheline_aligned;
struct virtio_gpio_response res ____cacheline_aligned;
unsigned int rxlen;
__dma_from_device_group_begin();
struct virtio_gpio_request req;
struct virtio_gpio_response res;
__dma_from_device_group_end();
};
struct vgpio_irq_line {
@@ -37,8 +42,10 @@ struct vgpio_irq_line {
bool update_pending;
bool queue_pending;
struct virtio_gpio_irq_request ireq ____cacheline_aligned;
struct virtio_gpio_irq_response ires ____cacheline_aligned;
__dma_from_device_group_begin();
struct virtio_gpio_irq_request ireq;
struct virtio_gpio_irq_response ires;
__dma_from_device_group_end();
};
struct virtio_gpio {

View File

@@ -29,6 +29,7 @@
#include <scsi/scsi_tcq.h>
#include <scsi/scsi_devinfo.h>
#include <linux/seqlock.h>
#include <linux/dma-mapping.h>
#include "sd.h"
@@ -61,7 +62,7 @@ struct virtio_scsi_cmd {
struct virtio_scsi_event_node {
struct virtio_scsi *vscsi;
struct virtio_scsi_event event;
struct virtio_scsi_event *event;
struct work_struct work;
};
@@ -89,6 +90,11 @@ struct virtio_scsi {
struct virtio_scsi_vq ctrl_vq;
struct virtio_scsi_vq event_vq;
__dma_from_device_group_begin();
struct virtio_scsi_event events[VIRTIO_SCSI_EVENT_LEN];
__dma_from_device_group_end();
struct virtio_scsi_vq req_vqs[];
};
@@ -237,12 +243,12 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi,
unsigned long flags;
INIT_WORK(&event_node->work, virtscsi_handle_event);
sg_init_one(&sg, &event_node->event, sizeof(struct virtio_scsi_event));
sg_init_one(&sg, event_node->event, sizeof(struct virtio_scsi_event));
spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node,
GFP_ATOMIC);
err = virtqueue_add_inbuf_cache_clean(vscsi->event_vq.vq, &sg, 1, event_node,
GFP_ATOMIC);
if (!err)
virtqueue_kick(vscsi->event_vq.vq);
@@ -257,6 +263,7 @@ static int virtscsi_kick_event_all(struct virtio_scsi *vscsi)
for (i = 0; i < VIRTIO_SCSI_EVENT_LEN; i++) {
vscsi->event_list[i].vscsi = vscsi;
vscsi->event_list[i].event = &vscsi->events[i];
virtscsi_kick_event(vscsi, &vscsi->event_list[i]);
}
@@ -380,7 +387,7 @@ static void virtscsi_handle_event(struct work_struct *work)
struct virtio_scsi_event_node *event_node =
container_of(work, struct virtio_scsi_event_node, work);
struct virtio_scsi *vscsi = event_node->vscsi;
struct virtio_scsi_event *event = &event_node->event;
struct virtio_scsi_event *event = event_node->event;
if (event->event &
cpu_to_virtio32(vscsi->vdev, VIRTIO_SCSI_T_EVENTS_MISSED)) {

View File

@@ -2125,6 +2125,74 @@ static void teardown_steering(struct mlx5_vdpa_net *ndev)
mlx5_destroy_flow_table(ndev->rxft);
}
/*
 * mlx5_vdpa_change_mac() - switch the device MAC address and rebuild the
 * forwarding state that depends on it.
 *
 * Updates the MPFS (physical function MAC filter) table and then recreates
 * the MAC/VLAN forwarding rule for the new address. On forwarding-rule
 * failure the function attempts a full rollback to the previous MAC.
 *
 * @ndev:    the mlx5 vdpa net device whose config MAC is updated
 * @pfmdev:  the physical-function core device owning the MPFS table
 * @new_mac: the requested MAC address (must not be all-zero)
 *
 * Return: 0 on success, -EINVAL for a zero MAC, -EIO on any MPFS or
 * forwarding-rule failure (including a failed rollback).
 */
static int mlx5_vdpa_change_mac(struct mlx5_vdpa_net *ndev,
struct mlx5_core_dev *pfmdev,
const u8 *new_mac)
{
struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
u8 old_mac[ETH_ALEN];
/* An all-zero MAC is not a valid target address. */
if (is_zero_ether_addr(new_mac))
return -EINVAL;
/* Drop the current MAC from MPFS first, but only if one was set. */
if (!is_zero_ether_addr(ndev->config.mac)) {
if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
ndev->config.mac);
return -EIO;
}
}
if (mlx5_mpfs_add_mac(pfmdev, (u8 *)new_mac)) {
mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
new_mac);
return -EIO;
}
/* Back up the original MAC address so that if adding the forward rules
 * fails we can restore it.
 */
ether_addr_copy(old_mac, ndev->config.mac);
ether_addr_copy(ndev->config.mac, new_mac);
/* Recreate the flow table entry so that packets can be forwarded back.
 */
mac_vlan_del(ndev, old_mac, 0, false);
if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) {
mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
/* This path is unlikely, but double-check before rolling back. */
if (is_zero_ether_addr(old_mac)) {
mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
return -EIO;
}
/* Try to restore the original MAC address to the MPFS table, and
 * try to restore the forward rule entry.
 */
if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
ndev->config.mac);
}
if (mlx5_mpfs_add_mac(pfmdev, old_mac)) {
mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
old_mac);
}
ether_addr_copy(ndev->config.mac, old_mac);
if (mac_vlan_add(ndev, ndev->config.mac, 0, false))
mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
return -EIO;
}
return 0;
}
static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
{
struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
@@ -2132,12 +2200,13 @@ static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
struct mlx5_core_dev *pfmdev;
size_t read;
u8 mac[ETH_ALEN], mac_back[ETH_ALEN];
u8 mac[ETH_ALEN];
pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
switch (cmd) {
case VIRTIO_NET_CTRL_MAC_ADDR_SET:
read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov,
(void *)mac, ETH_ALEN);
if (read != ETH_ALEN)
break;
@@ -2145,66 +2214,8 @@ static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
status = VIRTIO_NET_OK;
break;
}
if (is_zero_ether_addr(mac))
break;
if (!is_zero_ether_addr(ndev->config.mac)) {
if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
ndev->config.mac);
break;
}
}
if (mlx5_mpfs_add_mac(pfmdev, mac)) {
mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
mac);
break;
}
/* backup the original mac address so that if failed to add the forward rules
* we could restore it
*/
memcpy(mac_back, ndev->config.mac, ETH_ALEN);
memcpy(ndev->config.mac, mac, ETH_ALEN);
/* Need recreate the flow table entry, so that the packet could forward back
*/
mac_vlan_del(ndev, mac_back, 0, false);
if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) {
mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
/* Although it hardly run here, we still need double check */
if (is_zero_ether_addr(mac_back)) {
mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
break;
}
/* Try to restore original mac address to MFPS table, and try to restore
* the forward rule entry.
*/
if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
ndev->config.mac);
}
if (mlx5_mpfs_add_mac(pfmdev, mac_back)) {
mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
mac_back);
}
memcpy(ndev->config.mac, mac_back, ETH_ALEN);
if (mac_vlan_add(ndev, ndev->config.mac, 0, false))
mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
break;
}
status = VIRTIO_NET_OK;
status = mlx5_vdpa_change_mac(ndev, pfmdev, mac) ? VIRTIO_NET_ERR :
VIRTIO_NET_OK;
break;
default:
@@ -3640,9 +3651,6 @@ static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group,
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
int err = 0;
if (group >= MLX5_VDPA_NUMVQ_GROUPS)
return -EINVAL;
mvdev->mres.group2asid[group] = asid;
mutex_lock(&mvdev->mres.lock);
@@ -4044,7 +4052,6 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *
static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev,
const struct vdpa_dev_set_config *add_config)
{
struct virtio_net_config *config;
struct mlx5_core_dev *pfmdev;
struct mlx5_vdpa_dev *mvdev;
struct mlx5_vdpa_net *ndev;
@@ -4054,16 +4061,23 @@ static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *
mvdev = to_mvdev(dev);
ndev = to_mlx5_vdpa_ndev(mvdev);
mdev = mvdev->mdev;
config = &ndev->config;
down_write(&ndev->reslock);
if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
if (!(ndev->mvdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) {
ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC);
} else {
mlx5_vdpa_warn(mvdev, "device running, skip updating MAC\n");
err = -EBUSY;
goto out;
}
pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
err = mlx5_mpfs_add_mac(pfmdev, config->mac);
if (!err)
ether_addr_copy(config->mac, add_config->net.mac);
err = mlx5_vdpa_change_mac(ndev, pfmdev,
(u8 *)add_config->net.mac);
}
out:
up_write(&ndev->reslock);
return err;
}

View File

@@ -606,12 +606,6 @@ static int vdpasim_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
struct vhost_iotlb *iommu;
int i;
if (group > vdpasim->dev_attr.ngroups)
return -EINVAL;
if (asid >= vdpasim->dev_attr.nas)
return -EINVAL;
iommu = &vdpasim->iommu[asid];
mutex_lock(&vdpasim->mutex);

View File

@@ -493,17 +493,15 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
vduse_domain_free_iova(iovad, dma_addr, size);
}
void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
size_t size, dma_addr_t *dma_addr,
gfp_t flag)
dma_addr_t vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
size_t size, void *orig)
{
struct iova_domain *iovad = &domain->consistent_iovad;
unsigned long limit = domain->iova_limit;
dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit);
void *orig = alloc_pages_exact(size, flag);
if (!iova || !orig)
goto err;
if (!iova)
return DMA_MAPPING_ERROR;
spin_lock(&domain->iotlb_lock);
if (vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1,
@@ -514,27 +512,20 @@ void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
}
spin_unlock(&domain->iotlb_lock);
*dma_addr = iova;
return iova;
return orig;
err:
*dma_addr = DMA_MAPPING_ERROR;
if (orig)
free_pages_exact(orig, size);
if (iova)
vduse_domain_free_iova(iovad, iova, size);
vduse_domain_free_iova(iovad, iova, size);
return NULL;
return DMA_MAPPING_ERROR;
}
void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
void *vaddr, dma_addr_t dma_addr,
unsigned long attrs)
dma_addr_t dma_addr, unsigned long attrs)
{
struct iova_domain *iovad = &domain->consistent_iovad;
struct vhost_iotlb_map *map;
struct vdpa_map_file *map_file;
phys_addr_t pa;
spin_lock(&domain->iotlb_lock);
map = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr,
@@ -546,12 +537,10 @@ void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
map_file = (struct vdpa_map_file *)map->opaque;
fput(map_file->file);
kfree(map_file);
pa = map->addr;
vhost_iotlb_map_free(domain->iotlb, map);
spin_unlock(&domain->iotlb_lock);
vduse_domain_free_iova(iovad, dma_addr, size);
free_pages_exact(phys_to_virt(pa), size);
}
static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf)

View File

@@ -65,13 +65,11 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
dma_addr_t dma_addr, size_t size,
enum dma_data_direction dir, unsigned long attrs);
void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
size_t size, dma_addr_t *dma_addr,
gfp_t flag);
dma_addr_t vduse_domain_alloc_coherent(struct vduse_iova_domain *domain,
size_t size, void *orig);
void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size,
void *vaddr, dma_addr_t dma_addr,
unsigned long attrs);
dma_addr_t dma_addr, unsigned long attrs);
void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain);

View File

@@ -9,6 +9,7 @@
*/
#include "linux/virtio_net.h"
#include <linux/cleanup.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cdev.h>
@@ -22,6 +23,7 @@
#include <linux/uio.h>
#include <linux/vdpa.h>
#include <linux/nospec.h>
#include <linux/virtio.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <uapi/linux/vduse.h>
@@ -39,6 +41,8 @@
#define DRV_LICENSE "GPL v2"
#define VDUSE_DEV_MAX (1U << MINORBITS)
#define VDUSE_DEV_MAX_GROUPS 0xffff
#define VDUSE_DEV_MAX_AS 0xffff
#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
@@ -48,6 +52,15 @@
#define IRQ_UNBOUND -1
/*
 * The VDUSE instance has not asked for the VDUSE API version, so assume 0.
 *
 * Old devices may not ask for the API version and assume it is 0; keep
 * this value for them. From the moment the VDUSE instance asks for the
 * version, convert to the latest supported one and continue the regular flow.
 */
#define VDUSE_API_VERSION_NOT_ASKED U64_MAX
struct vduse_virtqueue {
u16 index;
u16 num_max;
@@ -58,6 +71,7 @@ struct vduse_virtqueue {
struct vdpa_vq_state state;
bool ready;
bool kicked;
u32 group;
spinlock_t kick_lock;
spinlock_t irq_lock;
struct eventfd_ctx *kickfd;
@@ -83,11 +97,23 @@ struct vduse_umem {
struct mm_struct *mm;
};
struct vduse_as {
struct vduse_iova_domain *domain;
struct vduse_umem *umem;
struct mutex mem_lock;
};
struct vduse_vq_group {
rwlock_t as_lock;
struct vduse_as *as; /* Protected by as_lock */
struct vduse_dev *dev;
};
struct vduse_dev {
struct vduse_vdpa *vdev;
struct device *dev;
struct vduse_virtqueue **vqs;
struct vduse_iova_domain *domain;
struct vduse_as *as;
char *name;
struct mutex lock;
spinlock_t msg_lock;
@@ -114,8 +140,9 @@ struct vduse_dev {
u8 status;
u32 vq_num;
u32 vq_align;
struct vduse_umem *umem;
struct mutex mem_lock;
u32 ngroups;
u32 nas;
struct vduse_vq_group *groups;
unsigned int bounce_size;
struct mutex domain_lock;
};
@@ -305,7 +332,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status)
return vduse_dev_msg_sync(dev, &msg);
}
static int vduse_dev_update_iotlb(struct vduse_dev *dev,
static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid,
u64 start, u64 last)
{
struct vduse_dev_msg msg = { 0 };
@@ -314,8 +341,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev,
return -EINVAL;
msg.req.type = VDUSE_UPDATE_IOTLB;
msg.req.iova.start = start;
msg.req.iova.last = last;
if (dev->api_version < VDUSE_API_VERSION_1) {
msg.req.iova.start = start;
msg.req.iova.last = last;
} else {
msg.req.iova_v2.start = start;
msg.req.iova_v2.last = last;
msg.req.iova_v2.asid = asid;
}
return vduse_dev_msg_sync(dev, &msg);
}
@@ -430,11 +463,14 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait)
static void vduse_dev_reset(struct vduse_dev *dev)
{
int i;
struct vduse_iova_domain *domain = dev->domain;
/* The coherent mappings are handled in vduse_dev_free_coherent() */
if (domain && domain->bounce_map)
vduse_domain_reset_bounce_map(domain);
for (i = 0; i < dev->nas; i++) {
struct vduse_iova_domain *domain = dev->as[i].domain;
if (domain && domain->bounce_map)
vduse_domain_reset_bounce_map(domain);
}
down_write(&dev->rwsem);
@@ -592,6 +628,63 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
return 0;
}
/*
 * vduse_get_vq_group() - report which virtqueue group a virtqueue belongs to.
 *
 * Before API version 1 there is only a single implicit group, so 0 is
 * returned unconditionally; otherwise the group recorded for the vq is used.
 */
static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
if (dev->api_version < VDUSE_API_VERSION_1)
return 0;
return dev->vqs[idx]->group;
}
/*
 * vduse_get_vq_map() - return the map token for a virtqueue.
 *
 * The token carries a pointer to the vq's group; the DMA callbacks below
 * dereference it to reach the group's current address space.
 */
static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
u32 vq_group = vduse_get_vq_group(vdpa, idx);
union virtio_map ret = {
.group = &dev->groups[vq_group],
};
return ret;
}
DEFINE_GUARD(vq_group_as_read_lock, struct vduse_vq_group *,
if (_T->dev->nas > 1)
read_lock(&_T->as_lock),
if (_T->dev->nas > 1)
read_unlock(&_T->as_lock))
DEFINE_GUARD(vq_group_as_write_lock, struct vduse_vq_group *,
if (_T->dev->nas > 1)
write_lock(&_T->as_lock),
if (_T->dev->nas > 1)
write_unlock(&_T->as_lock))
/*
 * vduse_set_group_asid() - attach a virtqueue group to an address space.
 *
 * Only valid from API version 1. The userspace VDUSE instance is asked
 * first via a VDUSE_SET_VQ_GROUP_ASID message; only if it acknowledges the
 * change is the kernel-side group->as pointer updated, under the group's
 * as write lock (taken by the guard; a no-op when dev->nas <= 1, per the
 * DEFINE_GUARD above).
 *
 * Return: 0 on success, -EINVAL below API v1, or the negative error from
 * the message round-trip.
 */
static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group,
unsigned int asid)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
struct vduse_dev_msg msg = { 0 };
int r;
if (dev->api_version < VDUSE_API_VERSION_1)
return -EINVAL;
msg.req.type = VDUSE_SET_VQ_GROUP_ASID;
msg.req.vq_group_asid.group = group;
msg.req.vq_group_asid.asid = asid;
/* Ask userspace first; do not change kernel state unless it agrees. */
r = vduse_dev_msg_sync(dev, &msg);
if (r < 0)
return r;
guard(vq_group_as_write_lock)(&dev->groups[group]);
dev->groups[group].as = &dev->as[asid];
return 0;
}
static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
struct vdpa_vq_state *state)
{
@@ -763,13 +856,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
int ret;
ret = vduse_domain_set_map(dev->domain, iotlb);
ret = vduse_domain_set_map(dev->as[asid].domain, iotlb);
if (ret)
return ret;
ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX);
ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX);
if (ret) {
vduse_domain_clear_map(dev->domain, iotlb);
vduse_domain_clear_map(dev->as[asid].domain, iotlb);
return ret;
}
@@ -789,6 +882,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
.set_vq_cb = vduse_vdpa_set_vq_cb,
.set_vq_num = vduse_vdpa_set_vq_num,
.get_vq_size = vduse_vdpa_get_vq_size,
.get_vq_group = vduse_get_vq_group,
.set_vq_ready = vduse_vdpa_set_vq_ready,
.get_vq_ready = vduse_vdpa_get_vq_ready,
.set_vq_state = vduse_vdpa_set_vq_state,
@@ -811,6 +905,8 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
.get_vq_affinity = vduse_vdpa_get_vq_affinity,
.reset = vduse_vdpa_reset,
.set_map = vduse_vdpa_set_map,
.set_group_asid = vduse_set_group_asid,
.get_vq_map = vduse_get_vq_map,
.free = vduse_vdpa_free,
};
@@ -818,8 +914,13 @@ static void vduse_dev_sync_single_for_device(union virtio_map token,
dma_addr_t dma_addr, size_t size,
enum dma_data_direction dir)
{
struct vduse_iova_domain *domain = token.iova_domain;
struct vduse_iova_domain *domain;
if (!token.group)
return;
guard(vq_group_as_read_lock)(token.group);
domain = token.group->as->domain;
vduse_domain_sync_single_for_device(domain, dma_addr, size, dir);
}
@@ -827,8 +928,13 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token,
dma_addr_t dma_addr, size_t size,
enum dma_data_direction dir)
{
struct vduse_iova_domain *domain = token.iova_domain;
struct vduse_iova_domain *domain;
if (!token.group)
return;
guard(vq_group_as_read_lock)(token.group);
domain = token.group->as->domain;
vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir);
}
@@ -837,8 +943,13 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page,
enum dma_data_direction dir,
unsigned long attrs)
{
struct vduse_iova_domain *domain = token.iova_domain;
struct vduse_iova_domain *domain;
if (!token.group)
return DMA_MAPPING_ERROR;
guard(vq_group_as_read_lock)(token.group);
domain = token.group->as->domain;
return vduse_domain_map_page(domain, page, offset, size, dir, attrs);
}
@@ -846,43 +957,71 @@ static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr,
size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
struct vduse_iova_domain *domain = token.iova_domain;
struct vduse_iova_domain *domain;
return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
if (!token.group)
return;
guard(vq_group_as_read_lock)(token.group);
domain = token.group->as->domain;
vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs);
}
static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size,
dma_addr_t *dma_addr, gfp_t flag)
{
struct vduse_iova_domain *domain = token.iova_domain;
unsigned long iova;
void *addr;
*dma_addr = DMA_MAPPING_ERROR;
addr = vduse_domain_alloc_coherent(domain, size,
(dma_addr_t *)&iova, flag);
if (!token.group)
return NULL;
addr = alloc_pages_exact(size, flag);
if (!addr)
return NULL;
*dma_addr = (dma_addr_t)iova;
{
struct vduse_iova_domain *domain;
guard(vq_group_as_read_lock)(token.group);
domain = token.group->as->domain;
*dma_addr = vduse_domain_alloc_coherent(domain, size, addr);
if (*dma_addr == DMA_MAPPING_ERROR)
goto err;
}
return addr;
err:
free_pages_exact(addr, size);
return NULL;
}
static void vduse_dev_free_coherent(union virtio_map token, size_t size,
void *vaddr, dma_addr_t dma_addr,
unsigned long attrs)
{
struct vduse_iova_domain *domain = token.iova_domain;
if (!token.group)
return;
vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs);
{
struct vduse_iova_domain *domain;
guard(vq_group_as_read_lock)(token.group);
domain = token.group->as->domain;
vduse_domain_free_coherent(domain, size, dma_addr, attrs);
}
free_pages_exact(vaddr, size);
}
static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr)
{
struct vduse_iova_domain *domain = token.iova_domain;
if (!token.group)
return false;
return dma_addr < domain->bounce_size;
guard(vq_group_as_read_lock)(token.group);
return dma_addr < token.group->as->domain->bounce_size;
}
static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr)
@@ -894,9 +1033,11 @@ static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr)
static size_t vduse_dev_max_mapping_size(union virtio_map token)
{
struct vduse_iova_domain *domain = token.iova_domain;
if (!token.group)
return 0;
return domain->bounce_size;
guard(vq_group_as_read_lock)(token.group);
return token.group->as->domain->bounce_size;
}
static const struct virtio_map_ops vduse_map_ops = {
@@ -1036,39 +1177,40 @@ unlock:
return ret;
}
static int vduse_dev_dereg_umem(struct vduse_dev *dev,
static int vduse_dev_dereg_umem(struct vduse_dev *dev, u32 asid,
u64 iova, u64 size)
{
int ret;
mutex_lock(&dev->mem_lock);
mutex_lock(&dev->as[asid].mem_lock);
ret = -ENOENT;
if (!dev->umem)
if (!dev->as[asid].umem)
goto unlock;
ret = -EINVAL;
if (!dev->domain)
if (!dev->as[asid].domain)
goto unlock;
if (dev->umem->iova != iova || size != dev->domain->bounce_size)
if (dev->as[asid].umem->iova != iova ||
size != dev->as[asid].domain->bounce_size)
goto unlock;
vduse_domain_remove_user_bounce_pages(dev->domain);
unpin_user_pages_dirty_lock(dev->umem->pages,
dev->umem->npages, true);
atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm);
mmdrop(dev->umem->mm);
vfree(dev->umem->pages);
kfree(dev->umem);
dev->umem = NULL;
vduse_domain_remove_user_bounce_pages(dev->as[asid].domain);
unpin_user_pages_dirty_lock(dev->as[asid].umem->pages,
dev->as[asid].umem->npages, true);
atomic64_sub(dev->as[asid].umem->npages, &dev->as[asid].umem->mm->pinned_vm);
mmdrop(dev->as[asid].umem->mm);
vfree(dev->as[asid].umem->pages);
kfree(dev->as[asid].umem);
dev->as[asid].umem = NULL;
ret = 0;
unlock:
mutex_unlock(&dev->mem_lock);
mutex_unlock(&dev->as[asid].mem_lock);
return ret;
}
static int vduse_dev_reg_umem(struct vduse_dev *dev,
u64 iova, u64 uaddr, u64 size)
u32 asid, u64 iova, u64 uaddr, u64 size)
{
struct page **page_list = NULL;
struct vduse_umem *umem = NULL;
@@ -1076,14 +1218,14 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
unsigned long npages, lock_limit;
int ret;
if (!dev->domain || !dev->domain->bounce_map ||
size != dev->domain->bounce_size ||
if (!dev->as[asid].domain || !dev->as[asid].domain->bounce_map ||
size != dev->as[asid].domain->bounce_size ||
iova != 0 || uaddr & ~PAGE_MASK)
return -EINVAL;
mutex_lock(&dev->mem_lock);
mutex_lock(&dev->as[asid].mem_lock);
ret = -EEXIST;
if (dev->umem)
if (dev->as[asid].umem)
goto unlock;
ret = -ENOMEM;
@@ -1107,7 +1249,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
goto out;
}
ret = vduse_domain_add_user_bounce_pages(dev->domain,
ret = vduse_domain_add_user_bounce_pages(dev->as[asid].domain,
page_list, pinned);
if (ret)
goto out;
@@ -1120,7 +1262,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
umem->mm = current->mm;
mmgrab(current->mm);
dev->umem = umem;
dev->as[asid].umem = umem;
out:
if (ret && pinned > 0)
unpin_user_pages(page_list, pinned);
@@ -1131,7 +1273,7 @@ unlock:
vfree(page_list);
kfree(umem);
}
mutex_unlock(&dev->mem_lock);
mutex_unlock(&dev->as[asid].mem_lock);
return ret;
}
@@ -1151,6 +1293,54 @@ static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
vq->irq_effective_cpu = curr_cpu;
}
/*
 * vduse_dev_iotlb_entry() - look up the IOTLB map covering an IOVA range
 * in a given address space.
 *
 * Shared by the VDUSE_IOTLB_GET_FD/GET_FD2 and IOTLB_GET_INFO ioctl paths.
 * On success the entry's start/last/perm are rewritten to describe the
 * found mapping.
 *
 * @entry:      in/out; asid plus the queried [start, last] range
 * @f:          if non-NULL, receives a new reference to the backing file
 *              (caller must fput() it) and entry->offset is filled in
 * @capability: if non-NULL, receives VDUSE_IOVA_CAP_UMEM when the mapping
 *              is exactly the bounce region of this address space, else 0
 *
 * Return: 0 if a mapping was found, -EINVAL on a bad range/asid or when
 * no domain/mapping exists.
 *
 * Locking: takes dev->domain_lock, then the domain's iotlb_lock spinlock.
 */
static int vduse_dev_iotlb_entry(struct vduse_dev *dev,
struct vduse_iotlb_entry_v2 *entry,
struct file **f, uint64_t *capability)
{
u32 asid;
int r = -EINVAL;
struct vhost_iotlb_map *map;
if (entry->start > entry->last || entry->asid >= dev->nas)
return -EINVAL;
/* Clamp the user-supplied index against speculative out-of-bounds use. */
asid = array_index_nospec(entry->asid, dev->nas);
mutex_lock(&dev->domain_lock);
if (!dev->as[asid].domain)
goto out;
spin_lock(&dev->as[asid].domain->iotlb_lock);
map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb,
entry->start, entry->last);
if (map) {
if (f) {
const struct vdpa_map_file *map_file;
map_file = (struct vdpa_map_file *)map->opaque;
entry->offset = map_file->offset;
*f = get_file(map_file->file);
}
entry->start = map->start;
entry->last = map->last;
entry->perm = map->perm;
if (capability) {
*capability = 0;
if (dev->as[asid].domain->bounce_map && map->start == 0 &&
map->last == dev->as[asid].domain->bounce_size - 1)
*capability |= VDUSE_IOVA_CAP_UMEM;
}
r = 0;
}
spin_unlock(&dev->as[asid].domain->iotlb_lock);
out:
mutex_unlock(&dev->domain_lock);
return r;
}
static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -1162,44 +1352,36 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
return -EPERM;
switch (cmd) {
case VDUSE_IOTLB_GET_FD: {
struct vduse_iotlb_entry entry;
struct vhost_iotlb_map *map;
struct vdpa_map_file *map_file;
case VDUSE_IOTLB_GET_FD:
case VDUSE_IOTLB_GET_FD2: {
struct vduse_iotlb_entry_v2 entry = {0};
struct file *f = NULL;
ret = -ENOIOCTLCMD;
if (dev->api_version < VDUSE_API_VERSION_1 &&
cmd == VDUSE_IOTLB_GET_FD2)
break;
ret = -EFAULT;
if (copy_from_user(&entry, argp, sizeof(entry)))
if (copy_from_user(&entry, argp, _IOC_SIZE(cmd)))
break;
ret = -EINVAL;
if (entry.start > entry.last)
if (!is_mem_zero((const char *)entry.reserved,
sizeof(entry.reserved)))
break;
mutex_lock(&dev->domain_lock);
if (!dev->domain) {
mutex_unlock(&dev->domain_lock);
ret = vduse_dev_iotlb_entry(dev, &entry, &f, NULL);
if (ret)
break;
}
spin_lock(&dev->domain->iotlb_lock);
map = vhost_iotlb_itree_first(dev->domain->iotlb,
entry.start, entry.last);
if (map) {
map_file = (struct vdpa_map_file *)map->opaque;
f = get_file(map_file->file);
entry.offset = map_file->offset;
entry.start = map->start;
entry.last = map->last;
entry.perm = map->perm;
}
spin_unlock(&dev->domain->iotlb_lock);
mutex_unlock(&dev->domain_lock);
ret = -EINVAL;
if (!f)
break;
ret = -EFAULT;
if (copy_to_user(argp, &entry, sizeof(entry))) {
ret = copy_to_user(argp, &entry, _IOC_SIZE(cmd));
if (ret) {
ret = -EFAULT;
fput(f);
break;
}
@@ -1252,12 +1434,24 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
if (config.index >= dev->vq_num)
break;
if (!is_mem_zero((const char *)config.reserved,
sizeof(config.reserved)))
if (dev->api_version < VDUSE_API_VERSION_1) {
if (config.group)
break;
} else {
if (config.group >= dev->ngroups)
break;
if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)
break;
}
if (config.reserved1 ||
!is_mem_zero((const char *)config.reserved2,
sizeof(config.reserved2)))
break;
index = array_index_nospec(config.index, dev->vq_num);
dev->vqs[index]->num_max = config.max_size;
dev->vqs[index]->group = config.group;
ret = 0;
break;
}
@@ -1336,6 +1530,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
}
case VDUSE_IOTLB_REG_UMEM: {
struct vduse_iova_umem umem;
u32 asid;
ret = -EFAULT;
if (copy_from_user(&umem, argp, sizeof(umem)))
@@ -1343,17 +1538,21 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
ret = -EINVAL;
if (!is_mem_zero((const char *)umem.reserved,
sizeof(umem.reserved)))
sizeof(umem.reserved)) ||
(dev->api_version < VDUSE_API_VERSION_1 &&
umem.asid != 0) || umem.asid >= dev->nas)
break;
mutex_lock(&dev->domain_lock);
ret = vduse_dev_reg_umem(dev, umem.iova,
asid = array_index_nospec(umem.asid, dev->nas);
ret = vduse_dev_reg_umem(dev, asid, umem.iova,
umem.uaddr, umem.size);
mutex_unlock(&dev->domain_lock);
break;
}
case VDUSE_IOTLB_DEREG_UMEM: {
struct vduse_iova_umem umem;
u32 asid;
ret = -EFAULT;
if (copy_from_user(&umem, argp, sizeof(umem)))
@@ -1361,51 +1560,49 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
ret = -EINVAL;
if (!is_mem_zero((const char *)umem.reserved,
sizeof(umem.reserved)))
sizeof(umem.reserved)) ||
(dev->api_version < VDUSE_API_VERSION_1 &&
umem.asid != 0) ||
umem.asid >= dev->nas)
break;
mutex_lock(&dev->domain_lock);
ret = vduse_dev_dereg_umem(dev, umem.iova,
asid = array_index_nospec(umem.asid, dev->nas);
ret = vduse_dev_dereg_umem(dev, asid, umem.iova,
umem.size);
mutex_unlock(&dev->domain_lock);
break;
}
case VDUSE_IOTLB_GET_INFO: {
struct vduse_iova_info info;
struct vhost_iotlb_map *map;
struct vduse_iotlb_entry_v2 entry;
ret = -EFAULT;
if (copy_from_user(&info, argp, sizeof(info)))
break;
ret = -EINVAL;
if (info.start > info.last)
break;
if (!is_mem_zero((const char *)info.reserved,
sizeof(info.reserved)))
break;
mutex_lock(&dev->domain_lock);
if (!dev->domain) {
mutex_unlock(&dev->domain_lock);
if (dev->api_version < VDUSE_API_VERSION_1) {
if (info.asid)
break;
} else if (info.asid >= dev->nas)
break;
}
spin_lock(&dev->domain->iotlb_lock);
map = vhost_iotlb_itree_first(dev->domain->iotlb,
info.start, info.last);
if (map) {
info.start = map->start;
info.last = map->last;
info.capability = 0;
if (dev->domain->bounce_map && map->start == 0 &&
map->last == dev->domain->bounce_size - 1)
info.capability |= VDUSE_IOVA_CAP_UMEM;
}
spin_unlock(&dev->domain->iotlb_lock);
mutex_unlock(&dev->domain_lock);
if (!map)
entry.start = info.start;
entry.last = info.last;
entry.asid = info.asid;
ret = vduse_dev_iotlb_entry(dev, &entry, NULL,
&info.capability);
if (ret < 0)
break;
info.start = entry.start;
info.last = entry.last;
info.asid = entry.asid;
ret = -EFAULT;
if (copy_to_user(argp, &info, sizeof(info)))
break;
@@ -1426,8 +1623,10 @@ static int vduse_dev_release(struct inode *inode, struct file *file)
struct vduse_dev *dev = file->private_data;
mutex_lock(&dev->domain_lock);
if (dev->domain)
vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
for (int i = 0; i < dev->nas; i++)
if (dev->as[i].domain)
vduse_dev_dereg_umem(dev, i, 0,
dev->as[i].domain->bounce_size);
mutex_unlock(&dev->domain_lock);
spin_lock(&dev->msg_lock);
/* Make sure the inflight messages can processed after reconncection */
@@ -1646,7 +1845,6 @@ static struct vduse_dev *vduse_dev_create(void)
return NULL;
mutex_init(&dev->lock);
mutex_init(&dev->mem_lock);
mutex_init(&dev->domain_lock);
spin_lock_init(&dev->msg_lock);
INIT_LIST_HEAD(&dev->send_list);
@@ -1697,9 +1895,13 @@ static int vduse_destroy_dev(char *name)
idr_remove(&vduse_idr, dev->minor);
kvfree(dev->config);
vduse_dev_deinit_vqs(dev);
if (dev->domain)
vduse_domain_destroy(dev->domain);
for (int i = 0; i < dev->nas; i++) {
if (dev->as[i].domain)
vduse_domain_destroy(dev->as[i].domain);
}
kfree(dev->as);
kfree(dev->name);
kfree(dev->groups);
vduse_dev_destroy(dev);
module_put(THIS_MODULE);
@@ -1737,12 +1939,25 @@ static bool features_is_valid(struct vduse_dev_config *config)
return true;
}
static bool vduse_validate_config(struct vduse_dev_config *config)
static bool vduse_validate_config(struct vduse_dev_config *config,
u64 api_version)
{
if (!is_mem_zero((const char *)config->reserved,
sizeof(config->reserved)))
return false;
if (api_version < VDUSE_API_VERSION_1 &&
(config->ngroups || config->nas))
return false;
if (api_version >= VDUSE_API_VERSION_1) {
if (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS)
return false;
if (!config->nas || config->nas > VDUSE_DEV_MAX_AS)
return false;
}
if (config->vq_align > PAGE_SIZE)
return false;
@@ -1806,7 +2021,8 @@ static ssize_t bounce_size_store(struct device *device,
ret = -EPERM;
mutex_lock(&dev->domain_lock);
if (dev->domain)
/* Assuming that if the first domain is allocated, all are allocated */
if (dev->as[0].domain)
goto unlock;
ret = kstrtouint(buf, 10, &bounce_size);
@@ -1858,6 +2074,27 @@ static int vduse_create_dev(struct vduse_dev_config *config,
dev->device_features = config->features;
dev->device_id = config->device_id;
dev->vendor_id = config->vendor_id;
dev->nas = (dev->api_version < VDUSE_API_VERSION_1) ? 1 : config->nas;
dev->as = kcalloc(dev->nas, sizeof(dev->as[0]), GFP_KERNEL);
if (!dev->as)
goto err_as;
for (int i = 0; i < dev->nas; i++)
mutex_init(&dev->as[i].mem_lock);
dev->ngroups = (dev->api_version < VDUSE_API_VERSION_1)
? 1
: config->ngroups;
dev->groups = kcalloc(dev->ngroups, sizeof(dev->groups[0]),
GFP_KERNEL);
if (!dev->groups)
goto err_vq_groups;
for (u32 i = 0; i < dev->ngroups; ++i) {
dev->groups[i].dev = dev;
rwlock_init(&dev->groups[i].as_lock);
dev->groups[i].as = &dev->as[0];
}
dev->name = kstrdup(config->name, GFP_KERNEL);
if (!dev->name)
goto err_str;
@@ -1894,6 +2131,10 @@ err_dev:
err_idr:
kfree(dev->name);
err_str:
kfree(dev->groups);
err_vq_groups:
kfree(dev->as);
err_as:
vduse_dev_destroy(dev);
err:
return ret;
@@ -1909,6 +2150,8 @@ static long vduse_ioctl(struct file *file, unsigned int cmd,
mutex_lock(&vduse_lock);
switch (cmd) {
case VDUSE_GET_API_VERSION:
if (control->api_version == VDUSE_API_VERSION_NOT_ASKED)
control->api_version = VDUSE_API_VERSION_1;
ret = put_user(control->api_version, (u64 __user *)argp);
break;
case VDUSE_SET_API_VERSION: {
@@ -1919,7 +2162,7 @@ static long vduse_ioctl(struct file *file, unsigned int cmd,
break;
ret = -EINVAL;
if (api_version > VDUSE_API_VERSION)
if (api_version > VDUSE_API_VERSION_1)
break;
ret = 0;
@@ -1936,7 +2179,9 @@ static long vduse_ioctl(struct file *file, unsigned int cmd,
break;
ret = -EINVAL;
if (vduse_validate_config(&config) == false)
if (control->api_version == VDUSE_API_VERSION_NOT_ASKED)
control->api_version = VDUSE_API_VERSION;
if (!vduse_validate_config(&config, control->api_version))
break;
buf = vmemdup_user(argp + size, config.config_size);
@@ -1986,7 +2231,7 @@ static int vduse_open(struct inode *inode, struct file *file)
if (!control)
return -ENOMEM;
control->api_version = VDUSE_API_VERSION;
control->api_version = VDUSE_API_VERSION_NOT_ASKED;
file->private_data = control;
return 0;
@@ -2017,7 +2262,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev,
&vduse_vdpa_config_ops, &vduse_map_ops,
1, 1, name, true);
dev->ngroups, dev->nas, name, true);
if (IS_ERR(vdev))
return PTR_ERR(vdev);
@@ -2032,7 +2277,8 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
const struct vdpa_dev_set_config *config)
{
struct vduse_dev *dev;
int ret;
size_t domain_bounce_size;
int ret, i;
mutex_lock(&vduse_lock);
dev = vduse_find_dev(name);
@@ -2046,27 +2292,41 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
return ret;
mutex_lock(&dev->domain_lock);
if (!dev->domain)
dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
dev->bounce_size);
mutex_unlock(&dev->domain_lock);
if (!dev->domain) {
put_device(&dev->vdev->vdpa.dev);
return -ENOMEM;
ret = 0;
domain_bounce_size = dev->bounce_size / dev->nas;
for (i = 0; i < dev->nas; ++i) {
dev->as[i].domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
domain_bounce_size);
if (!dev->as[i].domain) {
ret = -ENOMEM;
goto err;
}
}
dev->vdev->vdpa.vmap.iova_domain = dev->domain;
mutex_unlock(&dev->domain_lock);
ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
if (ret) {
put_device(&dev->vdev->vdpa.dev);
mutex_lock(&dev->domain_lock);
vduse_domain_destroy(dev->domain);
dev->domain = NULL;
mutex_unlock(&dev->domain_lock);
return ret;
}
if (ret)
goto err_register;
return 0;
err_register:
mutex_lock(&dev->domain_lock);
err:
for (int j = 0; j < i; j++) {
if (dev->as[j].domain) {
vduse_domain_destroy(dev->as[j].domain);
dev->as[j].domain = NULL;
}
}
mutex_unlock(&dev->domain_lock);
put_device(&dev->vdev->vdpa.dev);
return ret;
}
static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)

View File

@@ -680,8 +680,10 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
case VHOST_VDPA_SET_GROUP_ASID:
if (copy_from_user(&s, argp, sizeof(s)))
return -EFAULT;
if (s.num >= vdpa->nas)
if (idx >= vdpa->ngroups || s.num >= vdpa->nas)
return -EINVAL;
if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK)
return -EBUSY;
if (!ops->set_group_asid)
return -EOPNOTSUPP;
return ops->set_group_asid(vdpa, idx, s.num);
@@ -1527,6 +1529,7 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma)
if (vma->vm_end - vma->vm_start != notify.size)
return -ENOTSUPP;
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &vhost_vdpa_vm_ops;
return 0;

View File

@@ -1444,13 +1444,13 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
({ \
int ret; \
if (!vq->iotlb) { \
ret = __put_user(x, ptr); \
ret = put_user(x, ptr); \
} else { \
__typeof__(ptr) to = \
(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
sizeof(*ptr), VHOST_ADDR_USED); \
if (to != NULL) \
ret = __put_user(x, to); \
ret = put_user(x, to); \
else \
ret = -EFAULT; \
} \
@@ -1489,14 +1489,14 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
({ \
int ret; \
if (!vq->iotlb) { \
ret = __get_user(x, ptr); \
ret = get_user(x, ptr); \
} else { \
__typeof__(ptr) from = \
(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
sizeof(*ptr), \
type); \
if (from != NULL) \
ret = __get_user(x, from); \
ret = get_user(x, from); \
else \
ret = -EFAULT; \
} \

View File

@@ -4,6 +4,7 @@
#include <linux/virtio_config.h>
#include <linux/input.h>
#include <linux/slab.h>
#include <linux/dma-mapping.h>
#include <uapi/linux/virtio_ids.h>
#include <uapi/linux/virtio_input.h>
@@ -16,7 +17,9 @@ struct virtio_input {
char serial[64];
char phys[64];
struct virtqueue *evt, *sts;
__dma_from_device_group_begin();
struct virtio_input_event evts[64];
__dma_from_device_group_end();
spinlock_t lock;
bool ready;
};
@@ -27,7 +30,7 @@ static void virtinput_queue_evtbuf(struct virtio_input *vi,
struct scatterlist sg[1];
sg_init_one(sg, evtbuf, sizeof(*evtbuf));
virtqueue_add_inbuf(vi->evt, sg, 1, evtbuf, GFP_ATOMIC);
virtqueue_add_inbuf_cache_clean(vi->evt, sg, 1, evtbuf, GFP_ATOMIC);
}
static void virtinput_recv_events(struct virtqueue *vq)

File diff suppressed because it is too large Load Diff

View File

@@ -7,6 +7,7 @@
#include <linux/dma-direction.h>
#include <linux/scatterlist.h>
#include <linux/bug.h>
#include <linux/cache.h>
/**
* List of possible attributes associated with a DMA mapping. The semantics
@@ -78,6 +79,13 @@
*/
#define DMA_ATTR_MMIO (1UL << 10)
/*
* DMA_ATTR_CPU_CACHE_CLEAN: Indicates the CPU will not dirty any cacheline
* overlapping this buffer while it is mapped for DMA. All mappings sharing
* a cacheline must have this attribute for this to be considered safe.
*/
#define DMA_ATTR_CPU_CACHE_CLEAN (1UL << 11)
/*
* A dma_addr_t can hold any valid DMA or bus address for the platform. It can
* be given to a device to use as a DMA source or target. It is specific to a
@@ -703,6 +711,18 @@ static inline int dma_get_cache_alignment(void)
}
#endif
#ifdef ARCH_HAS_DMA_MINALIGN
#define ____dma_from_device_aligned __aligned(ARCH_DMA_MINALIGN)
#else
#define ____dma_from_device_aligned
#endif
/* Mark start of DMA buffer */
#define __dma_from_device_group_begin(GROUP) \
__cacheline_group_begin(GROUP) ____dma_from_device_aligned
/* Mark end of DMA buffer */
#define __dma_from_device_group_end(GROUP) \
__cacheline_group_end(GROUP) ____dma_from_device_aligned
static inline void *dmam_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp)
{

View File

@@ -312,7 +312,9 @@ struct vdpa_map_file {
* @idx: virtqueue index
* Returns the affinity mask
* @set_group_asid: Set address space identifier for a
* virtqueue group (optional)
* virtqueue group (optional). Caller must
* prevent this from being executed concurrently
* with set_status.
* @vdev: vdpa device
* @group: virtqueue group
* @asid: address space id for this group

View File

@@ -43,13 +43,13 @@ struct virtqueue {
void *priv;
};
struct vduse_iova_domain;
struct vduse_vq_group;
union virtio_map {
/* Device that performs DMA */
struct device *dma_dev;
/* VDUSE specific mapping data */
struct vduse_iova_domain *iova_domain;
/* VDUSE specific virtqueue group for doing map */
struct vduse_vq_group *group;
};
int virtqueue_add_outbuf(struct virtqueue *vq,
@@ -62,6 +62,11 @@ int virtqueue_add_inbuf(struct virtqueue *vq,
void *data,
gfp_t gfp);
int virtqueue_add_inbuf_cache_clean(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp);
int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,

View File

@@ -10,6 +10,10 @@
#define VDUSE_API_VERSION 0
/* VQ groups and ASID support */
#define VDUSE_API_VERSION_1 1
/*
* Get the version of VDUSE API that kernel supported (VDUSE_API_VERSION).
* This is used for future extension.
@@ -27,6 +31,8 @@
* @features: virtio features
* @vq_num: the number of virtqueues
* @vq_align: the allocation alignment of virtqueue's metadata
* @ngroups: number of vq groups that VDUSE device declares
* @nas: number of address spaces that VDUSE device declares
* @reserved: for future use, needs to be initialized to zero
* @config_size: the size of the configuration space
* @config: the buffer of the configuration space
@@ -41,7 +47,9 @@ struct vduse_dev_config {
__u64 features;
__u32 vq_num;
__u32 vq_align;
__u32 reserved[13];
__u32 ngroups; /* if VDUSE_API_VERSION >= 1 */
__u32 nas; /* if VDUSE_API_VERSION >= 1 */
__u32 reserved[11];
__u32 config_size;
__u8 config[];
};
@@ -118,14 +126,18 @@ struct vduse_config_data {
* struct vduse_vq_config - basic configuration of a virtqueue
* @index: virtqueue index
* @max_size: the max size of virtqueue
* @reserved: for future use, needs to be initialized to zero
* @reserved1: for future use, needs to be initialized to zero
* @group: virtqueue group
* @reserved2: for future use, needs to be initialized to zero
*
* Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue.
*/
struct vduse_vq_config {
__u32 index;
__u16 max_size;
__u16 reserved[13];
__u16 reserved1;
__u32 group;
__u16 reserved2[10];
};
/*
@@ -156,6 +168,16 @@ struct vduse_vq_state_packed {
__u16 last_used_idx;
};
/**
* struct vduse_vq_group_asid - virtqueue group ASID
* @group: Index of the virtqueue group
* @asid: Address space ID of the group
*/
struct vduse_vq_group_asid {
__u32 group;
__u32 asid;
};
/**
* struct vduse_vq_info - information of a virtqueue
* @index: virtqueue index
@@ -215,6 +237,7 @@ struct vduse_vq_eventfd {
* @uaddr: start address of userspace memory, it must be aligned to page size
* @iova: start of the IOVA region
* @size: size of the IOVA region
* @asid: Address space ID of the IOVA region
* @reserved: for future use, needs to be initialized to zero
*
* Structure used by VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM
@@ -224,7 +247,8 @@ struct vduse_iova_umem {
__u64 uaddr;
__u64 iova;
__u64 size;
__u64 reserved[3];
__u32 asid;
__u32 reserved[5];
};
/* Register userspace memory for IOVA regions */
@@ -238,6 +262,7 @@ struct vduse_iova_umem {
* @start: start of the IOVA region
* @last: last of the IOVA region
* @capability: capability of the IOVA region
* @asid: Address space ID of the IOVA region, only if device API version >= 1
* @reserved: for future use, needs to be initialized to zero
*
* Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of
@@ -248,7 +273,8 @@ struct vduse_iova_info {
__u64 last;
#define VDUSE_IOVA_CAP_UMEM (1 << 0)
__u64 capability;
__u64 reserved[3];
__u32 asid; /* Only if device API version >= 1 */
__u32 reserved[5];
};
/*
@@ -257,6 +283,32 @@ struct vduse_iova_info {
*/
#define VDUSE_IOTLB_GET_INFO _IOWR(VDUSE_BASE, 0x1a, struct vduse_iova_info)
/**
* struct vduse_iotlb_entry_v2 - entry of IOTLB to describe one IOVA region
*
* @v1: the original vduse_iotlb_entry
* @asid: address space ID of the IOVA region
* @reserved: for future use, needs to be initialized to zero
*
* Structure used by VDUSE_IOTLB_GET_FD2 ioctl to find an overlapped IOVA region.
*/
struct vduse_iotlb_entry_v2 {
__u64 offset;
__u64 start;
__u64 last;
__u8 perm;
__u8 padding[7];
__u32 asid;
__u32 reserved[11];
};
/*
* Same as VDUSE_IOTLB_GET_FD but with vduse_iotlb_entry_v2 argument that
* support extra fields.
*/
#define VDUSE_IOTLB_GET_FD2 _IOWR(VDUSE_BASE, 0x1b, struct vduse_iotlb_entry_v2)
/* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */
/**
@@ -265,11 +317,14 @@ struct vduse_iova_info {
* @VDUSE_SET_STATUS: set the device status
* @VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for
* specified IOVA range via VDUSE_IOTLB_GET_FD ioctl
* @VDUSE_SET_VQ_GROUP_ASID: Notify userspace to update the address space of a
* virtqueue group.
*/
enum vduse_req_type {
VDUSE_GET_VQ_STATE,
VDUSE_SET_STATUS,
VDUSE_UPDATE_IOTLB,
VDUSE_SET_VQ_GROUP_ASID,
};
/**
@@ -304,6 +359,19 @@ struct vduse_iova_range {
__u64 last;
};
/**
* struct vduse_iova_range_v2 - IOVA range [start, last] if API_VERSION >= 1
* @start: start of the IOVA range
* @last: last of the IOVA range
* @asid: address space ID of the IOVA range
*/
struct vduse_iova_range_v2 {
__u64 start;
__u64 last;
__u32 asid;
__u32 padding;
};
/**
* struct vduse_dev_request - control request
* @type: request type
@@ -312,6 +380,8 @@ struct vduse_iova_range {
* @vq_state: virtqueue state, only index field is available
* @s: device status
* @iova: IOVA range for updating
* @iova_v2: IOVA range for updating if API_VERSION >= 1
* @vq_group_asid: ASID of a virtqueue group
* @padding: padding
*
* Structure used by read(2) on /dev/vduse/$NAME.
@@ -324,6 +394,11 @@ struct vduse_dev_request {
struct vduse_vq_state vq_state;
struct vduse_dev_status s;
struct vduse_iova_range iova;
/* Following members but padding exist only if vduse api
* version >= 1
*/
struct vduse_iova_range_v2 iova_v2;
struct vduse_vq_group_asid vq_group_asid;
__u32 padding[32];
};
};

View File

@@ -31,9 +31,6 @@
* SUCH DAMAGE.
*
* Copyright Rusty Russell IBM Corporation 2007. */
#ifndef __KERNEL__
#include <stdint.h>
#endif
#include <linux/types.h>
#include <linux/virtio_types.h>
@@ -202,7 +199,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p,
vr->num = num;
vr->desc = p;
vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc));
vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16)
vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__virtio16)
+ align-1) & ~(align - 1));
}

View File

@@ -63,6 +63,7 @@ enum map_err_types {
* @sg_mapped_ents: 'mapped_ents' from dma_map_sg
* @paddr: physical start address of the mapping
* @map_err_type: track whether dma_mapping_error() was checked
* @is_cache_clean: driver promises not to write to buffer while mapped
* @stack_len: number of backtrace entries in @stack_entries
* @stack_entries: stack of backtrace history
*/
@@ -76,7 +77,8 @@ struct dma_debug_entry {
int sg_call_ents;
int sg_mapped_ents;
phys_addr_t paddr;
enum map_err_types map_err_type;
enum map_err_types map_err_type;
bool is_cache_clean;
#ifdef CONFIG_STACKTRACE
unsigned int stack_len;
unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
@@ -472,12 +474,15 @@ static int active_cacheline_dec_overlap(phys_addr_t cln)
return active_cacheline_set_overlap(cln, --overlap);
}
static int active_cacheline_insert(struct dma_debug_entry *entry)
static int active_cacheline_insert(struct dma_debug_entry *entry,
bool *overlap_cache_clean)
{
phys_addr_t cln = to_cacheline_number(entry);
unsigned long flags;
int rc;
*overlap_cache_clean = false;
/* If the device is not writing memory then we don't have any
* concerns about the cpu consuming stale data. This mitigates
* legitimate usages of overlapping mappings.
@@ -487,8 +492,16 @@ static int active_cacheline_insert(struct dma_debug_entry *entry)
spin_lock_irqsave(&radix_lock, flags);
rc = radix_tree_insert(&dma_active_cacheline, cln, entry);
if (rc == -EEXIST)
if (rc == -EEXIST) {
struct dma_debug_entry *existing;
active_cacheline_inc_overlap(cln);
existing = radix_tree_lookup(&dma_active_cacheline, cln);
/* A lookup failure here after we got -EEXIST is unexpected. */
WARN_ON(!existing);
if (existing)
*overlap_cache_clean = existing->is_cache_clean;
}
spin_unlock_irqrestore(&radix_lock, flags);
return rc;
@@ -583,19 +596,24 @@ DEFINE_SHOW_ATTRIBUTE(dump);
*/
static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
{
bool overlap_cache_clean;
struct hash_bucket *bucket;
unsigned long flags;
int rc;
entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN);
bucket = get_hash_bucket(entry, &flags);
hash_bucket_add(bucket, entry);
put_hash_bucket(bucket, flags);
rc = active_cacheline_insert(entry);
rc = active_cacheline_insert(entry, &overlap_cache_clean);
if (rc == -ENOMEM) {
pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
} else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
} else if (rc == -EEXIST &&
!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
!(entry->is_cache_clean && overlap_cache_clean) &&
!(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
is_swiotlb_active(entry->dev))) {
err_printk(entry->dev, entry,

View File

@@ -17,6 +17,7 @@
#include <linux/virtio_ids.h>
#include <linux/virtio_config.h>
#include <linux/virtio_vsock.h>
#include <linux/dma-mapping.h>
#include <net/sock.h>
#include <linux/mutex.h>
#include <net/af_vsock.h>
@@ -54,13 +55,6 @@ struct virtio_vsock {
int rx_buf_nr;
int rx_buf_max_nr;
/* The following fields are protected by event_lock.
* vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held.
*/
struct mutex event_lock;
bool event_run;
struct virtio_vsock_event event_list[8];
u32 guest_cid;
bool seqpacket_allow;
@@ -74,6 +68,15 @@ struct virtio_vsock {
*/
struct scatterlist *out_sgs[MAX_SKB_FRAGS + 1];
struct scatterlist out_bufs[MAX_SKB_FRAGS + 1];
/* The following fields are protected by event_lock.
* vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held.
*/
struct mutex event_lock;
bool event_run;
__dma_from_device_group_begin();
struct virtio_vsock_event event_list[8];
__dma_from_device_group_end();
};
static u32 virtio_transport_get_local_cid(void)
@@ -390,7 +393,7 @@ static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock,
sg_init_one(&sg, event, sizeof(*event));
return virtqueue_add_inbuf(vq, &sg, 1, event, GFP_KERNEL);
return virtqueue_add_inbuf_cache_clean(vq, &sg, 1, event, GFP_KERNEL);
}
/* event_lock must be held */

View File

@@ -1102,7 +1102,9 @@ our $declaration_macros = qr{(?x:
(?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(|
(?:$Storage\s+)?[HLP]?LIST_HEAD\s*\(|
(?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\(|
(?:$Storage\s+)?(?:XA_STATE|XA_STATE_ORDER)\s*\(
(?:$Storage\s+)?(?:XA_STATE|XA_STATE_ORDER)\s*\(|
__cacheline_group_(?:begin|end)(?:_aligned)?\s*\(|
__dma_from_device_group_(?:begin|end)\s*\(
)};
our %allow_repeated_words = (